diff --git a/Ditto.xcodeproj/project.pbxproj b/Ditto.xcodeproj/project.pbxproj index 836a2d3..5bfe819 100644 --- a/Ditto.xcodeproj/project.pbxproj +++ b/Ditto.xcodeproj/project.pbxproj @@ -11,6 +11,7 @@ CC000001AAAA000000000002 /* CloudSyncManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA000000000012 /* CloudSyncManager.swift */; }; CC000001AAAA000000000003 /* SubscriptionView.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA000000000013 /* SubscriptionView.swift */; }; CC000001AAAA000000000004 /* LegacyDataMigrator.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA000000000014 /* LegacyDataMigrator.swift */; }; + CC000001AAAA000000000030 /* WALSidecarRecovery.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA000000000031 /* WALSidecarRecovery.swift */; }; CC000001AAAA000000000016 /* DittoImportExport.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA000000000019 /* DittoImportExport.swift */; }; CC000001AAAA00000000001A /* Localizable.xcstrings in Resources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA00000000001B /* Localizable.xcstrings */; }; CC000001AAAA00000000001C /* SyncSettings.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA00000000001D /* SyncSettings.swift */; }; @@ -111,6 +112,7 @@ CC000001AAAA000000000012 /* CloudSyncManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CloudSyncManager.swift; sourceTree = ""; }; CC000001AAAA000000000013 /* SubscriptionView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SubscriptionView.swift; sourceTree = ""; }; CC000001AAAA000000000014 /* LegacyDataMigrator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LegacyDataMigrator.swift; sourceTree = ""; }; + CC000001AAAA000000000031 /* WALSidecarRecovery.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WALSidecarRecovery.swift; sourceTree = ""; }; CC000001AAAA000000000019 /* DittoImportExport.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DittoImportExport.swift; sourceTree = ""; }; CC000001AAAA00000000001B /* Localizable.xcstrings */ = {isa = PBXFileReference; lastKnownFileType = text.json.xcstrings; path = Localizable.xcstrings; sourceTree = ""; }; CC000001AAAA00000000001D /* SyncSettings.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SyncSettings.swift; sourceTree = ""; }; @@ -291,6 +293,7 @@ CC000001AAAA000000000012 /* CloudSyncManager.swift */, CC000001AAAA000000000013 /* SubscriptionView.swift */, CC000001AAAA000000000014 /* LegacyDataMigrator.swift */, + CC000001AAAA000000000031 /* WALSidecarRecovery.swift */, CC000001AAAA000000000019 /* DittoImportExport.swift */, CC000001AAAA00000000001B /* Localizable.xcstrings */, CC000001AAAA00000000001D /* SyncSettings.swift */, @@ -572,6 +575,7 @@ CC000001AAAA000000000002 /* CloudSyncManager.swift in Sources */, CC000001AAAA000000000003 /* SubscriptionView.swift in Sources */, CC000001AAAA000000000004 /* LegacyDataMigrator.swift in Sources */, + CC000001AAAA000000000030 /* WALSidecarRecovery.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/Ditto/DittoListView.swift b/Ditto/DittoListView.swift index cd97f9e..d23a8c2 100644 --- a/Ditto/DittoListView.swift +++ b/Ditto/DittoListView.swift @@ -88,20 +88,20 @@ struct DittoListView: View { Label("Set Up Keyboard", systemImage: KeyboardSetupStatus.hasFullAccess ? "keyboard.fill" : "keyboard") } - if LegacyDataMigrator.hasRecoverableLegacyData { - Button { - // Show the preview confirmation if we can read the - // legacy store; otherwise fall straight to the - // attempt-and-report-result flow so the user sees - // *why* recovery failed instead of a missing menu. - if let preview = LegacyDataMigrator.previewRecoverableData() { - legacyRecoveryPreview = preview - } else { - runLegacyRecovery() - } - } label: { - Label("Recover Old Dittos", systemImage: "tray.and.arrow.down") + // Always offer the entry point, even when we can't see a + // legacy store on disk. Users whose 2.x data was destroyed + // by the 3.0.0 cleanup will still tap here looking for it; + // runLegacyRecovery() reports the structured result + // (nothingOnDisk / foundButUnreadable / emptyStore / inserted) + // so they get a real explanation instead of a missing menu. + Button { + if let preview = LegacyDataMigrator.previewRecoverableData() { + legacyRecoveryPreview = preview + } else { + runLegacyRecovery() } + } label: { + Label("Recover Old Dittos", systemImage: "tray.and.arrow.down") } Button { @@ -207,10 +207,17 @@ struct DittoListView: View { } } message: { if let preview = legacyRecoveryPreview { - Text( - // swiftlint:disable:next line_length - "Found \(preview.dittoCount) dittos across \(preview.categoryCount) categories from your previous version of Ditto. Recovering will merge them into your current library; duplicates will be skipped." - ) + if preview.isWALRecovery { + Text( + // swiftlint:disable:next line_length + "Found \(preview.dittoCount) recoverable phrases in a backup file from your previous version of Ditto. The original category structure couldn't be preserved — recovered phrases will be added to a single \"Recovered\" category for you to re-organize." + ) + } else { + Text( + // swiftlint:disable:next line_length + "Found \(preview.dittoCount) dittos across \(preview.categoryCount) categories from your previous version of Ditto. Recovering will merge them into your current library; duplicates will be skipped." + ) + } } } .alert("Recovery Complete", isPresented: .init( diff --git a/Ditto/LegacyDataMigrator.swift b/Ditto/LegacyDataMigrator.swift index fe17378..f0b56be 100644 --- a/Ditto/LegacyDataMigrator.swift +++ b/Ditto/LegacyDataMigrator.swift @@ -1,3 +1,10 @@ +// swiftlint:disable file_length type_body_length +// +// This type concentrates discovery / read / write / telemetry for the v2 Core Data +// migration AND the orphan-WAL fallback in one place because they share state +// (App Group resolution, completion flag, logging category, outcome telemetry). +// Splitting them would force that state to be passed around or duplicated. + import CoreData import Foundation import OSLog @@ -52,30 +59,38 @@ enum LegacyDataMigrator { return exists } - /// True if a legacy SQLite file is on disk *anywhere* we know to look, regardless of - /// whether we can actually open it. The "Recover Old Dittos" menu item uses this so - /// users with an unreadable-but-present store still see the entry point — they get a - /// useful error from `recoverNow` instead of a silently-missing menu item. - static var hasRecoverableLegacyData: Bool { - legacyStoreURL != nil - } - /// Snapshot for the confirmation alert. struct RecoveryPreview { let categoryCount: Int let dittoCount: Int + /// True when the only thing on disk is an orphaned WAL sidecar (no main + /// `.sqlite`). Category structure is unrecoverable in this mode — items + /// land in a single "Recovered" bucket — so the UI surfaces a different + /// confirmation message. + let isWALRecovery: Bool } /// Reads the legacy store (without mutating it) and returns how many categories / - /// dittos would be imported. Returns nil if there's no recoverable store on disk - /// or the file is present but unreadable. + /// dittos would be imported. When the main `.sqlite` is missing but a `-wal` + /// sidecar exists, falls back to a WAL-frame extraction preview. Returns nil if + /// nothing is recoverable from either path. static func previewRecoverableData() -> RecoveryPreview? { - guard let url = legacyStoreURL else { return nil } - guard let legacy = try? readLegacyStore(at: url), !legacy.isEmpty else { return nil } - return RecoveryPreview( - categoryCount: legacy.count, - dittoCount: legacy.reduce(0) { $0 + $1.dittos.count } - ) + if let url = legacyStoreURL, + let legacy = try? readLegacyStore(at: url), + !legacy.isEmpty { + return RecoveryPreview( + categoryCount: legacy.count, + dittoCount: legacy.reduce(0) { $0 + $1.dittos.count }, + isWALRecovery: false + ) + } + if let walURL = findOrphanWAL() { + let phrases = WALSidecarRecovery.extractPhrases(from: walURL) + if !phrases.isEmpty { + return RecoveryPreview(categoryCount: 1, dittoCount: phrases.count, isWALRecovery: true) + } + } + return nil } /// Outcome of a manual recovery attempt. Surfaced in the UI so users see *why* nothing @@ -114,8 +129,15 @@ enum LegacyDataMigrator { /// flag. Never deletes the legacy store. Returns a structured result so the UI can /// distinguish "nothing on disk" from "file present but unreadable" from "successfully /// imported N dittos". + /// + /// If the main `.sqlite` is gone but a `-wal` sidecar survives, falls through to a raw + /// WAL-frame extraction that pulls candidate phrases out of the journal directly. + /// Category structure is lost in that mode — phrases land in a "Recovered" bucket. static func recoverNow(into context: ModelContext) -> RecoveryResult { guard let storeURL = legacyStoreURL else { + if let walURL = findOrphanWAL() { + return runWALSidecarRecovery(at: walURL, into: context) + } log.info("recoverNow: no legacy store on disk") logOutcome(source: "manual", outcome: "nothing_on_disk") return .nothingOnDisk @@ -324,6 +346,28 @@ enum LegacyDataMigrator { return nil } + /// Returns the URL of a `.sqlite-wal` file that has no main `.sqlite` alongside it. + /// Distinct from `findOrphanWALOrSHM` — that one's for diagnostic logging and accepts + /// either sidecar; this one only returns `-wal`, the only file that actually carries + /// page data we can extract from. SHM files contain just the WAL index and are useless + /// without the WAL. + private static func findOrphanWAL() -> URL? { + let fm = FileManager.default + guard let groupURL = fm.containerURL(forSecurityApplicationGroupIdentifier: appGroupIdentifier) else { + return nil + } + let names = ["Ditto.sqlite", "ditto.sqlite", "Ditto.SQLite"] + for base in names { + let walURL = groupURL.appendingPathComponent(base + "-wal") + guard fm.fileExists(atPath: walURL.path) else { continue } + let main = groupURL.appendingPathComponent(base) + if !fm.fileExists(atPath: main.path) { + return walURL + } + } + return nil + } + private static func fileSize(at url: URL) -> Int64 { let attrs = try? FileManager.default.attributesOfItem(atPath: url.path) return (attrs?[.size] as? NSNumber)?.int64Value ?? 0 @@ -562,4 +606,91 @@ enum LegacyDataMigrator { private static func markComplete() { UserDefaults(suiteName: appGroupIdentifier)?.set(true, forKey: migrationCompleteKey) } + + // MARK: - WAL sidecar recovery + + /// Title of the bucket category created when WAL-frame recovery succeeds. Surfaced + /// to the user in the confirmation alert so they know where to look. + static let walRecoveryCategoryTitle = "Recovered" + + /// Manual recovery from an orphaned `-wal` file. Pulls text out of WAL frame + /// payloads directly via `WALSidecarRecovery`, then writes the phrases into a + /// single `walRecoveryCategoryTitle` bucket since category structure is unrecoverable. + private static func runWALSidecarRecovery(at walURL: URL, into context: ModelContext) -> RecoveryResult { + log.info("runWALSidecarRecovery: parsing \(walURL.path, privacy: .public) (\(fileSize(at: walURL), privacy: .public) bytes)") + let phrases = WALSidecarRecovery.extractPhrases(from: walURL) + guard !phrases.isEmpty else { + log.info("runWALSidecarRecovery: WAL parsed but yielded no recoverable phrases") + logOutcome(source: "manual_wal", outcome: "empty_store") + return .emptyStore + } + + let beforeCount = (try? context.fetch(FetchDescriptor()).count) ?? 0 + writeWALPhrases(phrases, into: context) + + do { + try context.save() + } catch { + log.error("runWALSidecarRecovery: save failed: \(error.localizedDescription, privacy: .public)") + logOutcome(source: "manual_wal", outcome: "found_unreadable") + return .foundButUnreadable(error.localizedDescription) + } + + markComplete() + let afterCount = (try? context.fetch(FetchDescriptor()).count) ?? 0 + let inserted = max(0, afterCount - beforeCount) + log.info("runWALSidecarRecovery: inserted \(inserted, privacy: .public) phrases from WAL") + logOutcome( + source: "manual_wal", + outcome: inserted > 0 ? "success" : "no_new_data", + categoriesFound: 1, + dittosFound: phrases.count, + inserted: inserted + ) + return .inserted(inserted) + } + + /// Inserts WAL-extracted phrases into a single "Recovered" category, deduplicating + /// against anything already in that category. + @discardableResult + private static func writeWALPhrases(_ phrases: [String], into context: ModelContext) -> Int { + let profile: Profile + if let existing = (try? context.fetch(FetchDescriptor()))?.first { + profile = existing + } else { + profile = Profile() + context.insert(profile) + } + + let existingByTitle = Dictionary( + profile.orderedCategories.map { ($0.title, $0) }, + uniquingKeysWith: { first, _ in first } + ) + let category: DittoCategory + if let existing = existingByTitle[walRecoveryCategoryTitle] { + category = existing + } else { + category = DittoCategory(title: walRecoveryCategoryTitle, profile: profile) + category.sortOrder = profile.orderedCategories.count + context.insert(category) + profile.categories?.append(category) + } + + let existingTexts = Set((category.dittos ?? []).map { $0.text }) + var nextSort = (category.dittos ?? []).count + var inserted = 0 + for phrase in phrases { + guard !existingTexts.contains(phrase) else { continue } + let item = DittoItem(text: phrase, category: category) + item.sortOrder = nextSort + nextSort += 1 + context.insert(item) + category.dittos?.append(item) + inserted += 1 + } + log.info("writeWALPhrases: inserted \(inserted, privacy: .public) phrases into '\(walRecoveryCategoryTitle, privacy: .public)'") + return inserted + } } + +// swiftlint:enable file_length type_body_length diff --git a/Ditto/WALSidecarRecovery.swift b/Ditto/WALSidecarRecovery.swift new file mode 100644 index 0000000..0195d26 --- /dev/null +++ b/Ditto/WALSidecarRecovery.swift @@ -0,0 +1,222 @@ +import Foundation +import OSLog + +/// Last-resort recovery from an orphaned SQLite `-wal` file whose main `.sqlite` was +/// deleted by the 3.0.0 cleanup. +/// +/// The WAL contains pages that would have been merged back into the main DB on the next +/// checkpoint. We can't reopen them through SQLite (the WAL's salt values reference a +/// DB header we no longer have), so this parses the WAL frames directly and walks the +/// SQLite B-tree leaf pages within them to extract TEXT-typed record values. +/// +/// Format references: +/// - WAL: https://www.sqlite.org/walformat.html +/// - DB pages / records: https://www.sqlite.org/fileformat.html +/// +/// Recovery is best-effort by construction: WAL frame checksums are not verified, and +/// cells whose payload spills onto overflow pages are skipped (we don't have the main +/// DB's free-page list, so we can't follow overflow chains). For typical ditto-length +/// strings none of that matters. +enum WALSidecarRecovery { + + private static let log = Logger(subsystem: "io.kern.ditto", category: "WALSidecarRecovery") + + /// Extracts deduplicated TEXT-typed record values from table B-tree leaf pages + /// within `walURL`. Returned strings are sorted, trimmed, and filtered to drop: + /// - very short strings (`< 2` chars) — pure noise + /// - very long strings (`> 5000` chars) — almost certainly mis-parsed bytes + /// - obvious Core Data internal identifiers (`Z_PRIMARYKEY`, etc.) + static func extractPhrases(from walURL: URL) -> [String] { + guard let data = try? Data(contentsOf: walURL) else { + log.error("extractPhrases: could not read \(walURL.path, privacy: .public)") + return [] + } + return parse([UInt8](data)) + } + + /// Same as `extractPhrases(from:)` but takes raw bytes. Internal entry point so + /// tests can drive the parser against synthetic data without touching disk. + static func parse(_ bytes: [UInt8]) -> [String] { + guard bytes.count >= 32 else { return [] } + // WAL magic: 0x377F0682 (host-endian write) or 0x377F0683 (byte-swapped). + // The header is always stored big-endian on disk regardless. + let magic = readUInt32BE(bytes, at: 0) + guard magic == 0x377F_0682 || magic == 0x377F_0683 else { return [] } + let pageSize = Int(readUInt32BE(bytes, at: 8)) + // SQLite page sizes are powers of two between 512 and 65536. + guard pageSize >= 512, pageSize <= 65536, pageSize.nonzeroBitCount == 1 else { return [] } + + var results: Set = [] + let frameSize = 24 + pageSize + var offset = 32 // past WAL header + while offset + frameSize <= bytes.count { + let pageNumber = readUInt32BE(bytes, at: offset) + let pageBase = offset + 24 + // Page 1 has the 100-byte SQLite DB header before its B-tree header. + let btreeStart = pageNumber == 1 ? 100 : 0 + extractFromPage(bytes, pageBase: pageBase, pageSize: pageSize, btreeStart: btreeStart, into: &results) + offset += frameSize + } + return results.sorted() + } + + // MARK: - B-tree page walker + + private static func extractFromPage( + _ bytes: [UInt8], + pageBase: Int, + pageSize: Int, + btreeStart: Int, + into results: inout Set + ) { + let pageEnd = pageBase + pageSize + let header = pageBase + btreeStart + guard header < bytes.count, header < pageEnd else { return } + // Only table B-tree leaf pages (0x0D) carry the row payloads we want. Index + // pages, interior pages, and freelist pages don't contain user TEXT. + guard bytes[header] == 0x0D else { return } + + let cellCount = Int(readUInt16BE(bytes, at: header + 3)) + // Table-leaf page header is 8 bytes; cell pointer array follows. + let cellPtrStart = header + 8 + for i in 0..= pageBase, cellAbs < pageEnd else { continue } + parseTableLeafCell(bytes, at: cellAbs, pageEnd: pageEnd, into: &results) + } + } + + private static func parseTableLeafCell( + _ bytes: [UInt8], + at offset: Int, + pageEnd: Int, + into results: inout Set + ) { + var cursor = offset + guard let (payloadLength, n1) = readVarint(bytes, at: cursor) else { return } + cursor += n1 + guard let (_, n2) = readVarint(bytes, at: cursor) else { return } + cursor += n2 + + // If the payload extends past the page, this cell uses overflow pages. + // Skip — extracting from overflow without the main DB's free-page map is + // not worth the complexity for typical ditto-sized text. + let payloadEnd = cursor + Int(payloadLength) + guard payloadEnd <= pageEnd, payloadEnd <= bytes.count else { return } + + guard let (serials, bodyStart) = parseRecordHeader(bytes, at: cursor, payloadEnd: payloadEnd) else { return } + extractTexts(bytes, serials: serials, bodyStart: bodyStart, payloadEnd: payloadEnd, into: &results) + } + + /// Parses the record header at `offset`, returning the list of serial-type codes + /// and the offset where the record body starts. Returns nil if the header is + /// truncated or doesn't fit in the payload. + private static func parseRecordHeader( + _ bytes: [UInt8], + at offset: Int, + payloadEnd: Int + ) -> (serials: [UInt64], bodyStart: Int)? { + guard let (headerLength, hLen) = readVarint(bytes, at: offset) else { return nil } + var headerCursor = offset + hLen + let headerEnd = offset + Int(headerLength) + guard headerEnd <= payloadEnd else { return nil } + + var serials: [UInt64] = [] + while headerCursor < headerEnd { + guard let (st, sz) = readVarint(bytes, at: headerCursor) else { return nil } + serials.append(st) + headerCursor += sz + } + return (serials, headerEnd) + } + + /// Walks the record body once per serial type, decoding TEXT-typed fields and + /// inserting the keepable ones into `results`. Stops if any field would read + /// past `payloadEnd`. + private static func extractTexts( + _ bytes: [UInt8], + serials: [UInt64], + bodyStart: Int, + payloadEnd: Int, + into results: inout Set + ) { + var body = bodyStart + for st in serials { + let info = serialTypeInfo(st) + guard body + info.size <= payloadEnd else { return } + if info.isText, info.size > 0, let text = decodeKeepableText(bytes, at: body, size: info.size) { + results.insert(text) + } + body += info.size + } + } + + /// Decodes `size` bytes at `offset` as UTF-8, trims whitespace, and returns the + /// string only if it passes the keepable-content filter (length bounds + not a + /// known Core Data internal identifier). + private static func decodeKeepableText(_ bytes: [UInt8], at offset: Int, size: Int) -> String? { + let slice = Array(bytes[offset..<(offset + size)]) + guard let raw = String(bytes: slice, encoding: .utf8) else { return nil } + let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines) + guard trimmed.count >= 2, trimmed.count <= 5000, !isInternalString(trimmed) else { return nil } + return trimmed + } + + private static func serialTypeInfo(_ st: UInt64) -> (size: Int, isText: Bool) { + switch st { + case 0, 8, 9, 10, 11: return (0, false) + case 1: return (1, false) + case 2: return (2, false) + case 3: return (3, false) + case 4: return (4, false) + case 5: return (6, false) + case 6, 7: return (8, false) + default: + guard st >= 12 else { return (0, false) } + return (Int((st - 12) / 2), st % 2 == 1) + } + } + + private static func isInternalString(_ s: String) -> Bool { + // Core Data's Z_PRIMARYKEY / Z_METADATA / Z_MODELCACHE tables store entity + // names and serialized schema state as TEXT — we don't want to surface those + // alongside the user's actual ditto phrases. + let internalPrefixes = ["Z_PRIMARYKEY", "Z_METADATA", "Z_MODELCACHE", "NSStoreType"] + return internalPrefixes.contains { s.hasPrefix($0) } + } + + // MARK: - Byte readers + + private static func readUInt16BE(_ bytes: [UInt8], at offset: Int) -> UInt16 { + guard offset + 1 < bytes.count else { return 0 } + return UInt16(bytes[offset]) << 8 | UInt16(bytes[offset + 1]) + } + + private static func readUInt32BE(_ bytes: [UInt8], at offset: Int) -> UInt32 { + guard offset + 3 < bytes.count else { return 0 } + return UInt32(bytes[offset]) << 24 + | UInt32(bytes[offset + 1]) << 16 + | UInt32(bytes[offset + 2]) << 8 + | UInt32(bytes[offset + 3]) + } + + private static func readVarint(_ bytes: [UInt8], at offset: Int) -> (value: UInt64, bytesRead: Int)? { + var result: UInt64 = 0 + for i in 0..<9 { + guard offset + i < bytes.count else { return nil } + let byte = bytes[offset + i] + if i == 8 { + result = (result << 8) | UInt64(byte) + return (result, 9) + } + result = (result << 7) | UInt64(byte & 0x7F) + if byte & 0x80 == 0 { + return (result, i + 1) + } + } + return nil + } +} diff --git a/DittoTests/LegacyDataMigratorTests.swift b/DittoTests/LegacyDataMigratorTests.swift index d498672..2029a57 100644 --- a/DittoTests/LegacyDataMigratorTests.swift +++ b/DittoTests/LegacyDataMigratorTests.swift @@ -161,21 +161,6 @@ struct LegacyDataMigratorTests { #expect(!LegacyDataMigrator.needsMigration) } - @Test("hasRecoverableLegacyData ignores the completion flag") - func hasRecoverableIgnoresFlag() { - clearFlag() - defer { clearFlag() } - - // The unit test sandbox has no App Group container, so legacyStoreURL is nil. - // What we're verifying here is the negative: with no store on disk, - // hasRecoverableLegacyData is false regardless of the flag state. - appGroupDefaults()?.set(true, forKey: completeKey) - #expect(!LegacyDataMigrator.hasRecoverableLegacyData) - - appGroupDefaults()?.removeObject(forKey: completeKey) - #expect(!LegacyDataMigrator.hasRecoverableLegacyData) - } - @Test("Auto-migration marks the completion flag even when no store is on disk") func autoMigrationMarksCompleteWhenNoStore() throws { clearFlag() @@ -186,4 +171,156 @@ struct LegacyDataMigratorTests { #expect(!result) #expect(appGroupDefaults()?.bool(forKey: completeKey) == true) } + + // MARK: - WAL sidecar recovery + + @Test("extractPhrases pulls TEXT records out of synthetic WAL frames") + func walParserExtractsTextRecords() throws { + let wal = makeSyntheticWAL(phrases: [ + "meeting at ___", + "OOO today", + "on my way" + ]) + let walURL = try writeTempFile(wal, suffix: ".sqlite-wal") + defer { try? FileManager.default.removeItem(at: walURL) } + + let phrases = WALSidecarRecovery.extractPhrases(from: walURL) + #expect(phrases.contains("meeting at ___")) + #expect(phrases.contains("OOO today")) + #expect(phrases.contains("on my way")) + } + + @Test("extractPhrases drops too-short strings and Core Data internals") + func walParserFiltersNoise() throws { + // "a" → too short (dropped), "Z_PRIMARYKEY" / "Z_METADATA blob" → Core Data + // internals (dropped), "real ditto phrase" → kept. + let wal = makeSyntheticWAL(phrases: [ + "a", + "Z_PRIMARYKEY", + "Z_METADATA blob", + "real ditto phrase" + ]) + let walURL = try writeTempFile(wal, suffix: ".sqlite-wal") + defer { try? FileManager.default.removeItem(at: walURL) } + + let phrases = WALSidecarRecovery.extractPhrases(from: walURL) + #expect(phrases.contains("real ditto phrase")) + #expect(!phrases.contains("a")) + #expect(!phrases.contains("Z_PRIMARYKEY")) + #expect(!phrases.contains("Z_METADATA blob")) + } + + @Test("extractPhrases returns empty for a file without WAL magic") + func walParserRejectsGarbage() throws { + let walURL = try writeTempFile(Data(repeating: 0xFF, count: 200), suffix: ".sqlite-wal") + defer { try? FileManager.default.removeItem(at: walURL) } + + #expect(WALSidecarRecovery.extractPhrases(from: walURL).isEmpty) + } + + @Test("extractPhrases deduplicates repeated phrases across frames") + func walParserDeduplicates() throws { + let wal = makeSyntheticWAL(phrases: ["duplicate phrase", "duplicate phrase", "unique phrase"]) + let walURL = try writeTempFile(wal, suffix: ".sqlite-wal") + defer { try? FileManager.default.removeItem(at: walURL) } + + let phrases = WALSidecarRecovery.extractPhrases(from: walURL) + #expect(phrases.filter { $0 == "duplicate phrase" }.count == 1) + #expect(phrases.contains("unique phrase")) + } + + // MARK: - Synthetic WAL builder + + /// Writes `data` to a uniquely-named temp file with the given suffix and returns its URL. + private func writeTempFile(_ data: Data, suffix: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString + suffix) + try data.write(to: url) + return url + } + + /// Builds a minimal valid SQLite WAL file containing a single frame whose page is a + /// table B-tree leaf with one cell per input string. Each cell is a record with one + /// TEXT column. Page size is 4096; the page is page #2 so the parser doesn't apply + /// the 100-byte SQLite-DB-header offset. + /// + /// Strings must be ≤127 bytes after UTF-8 encoding (so all varints fit in 1 byte); + /// that's plenty for ditto-sized phrases and keeps this helper readable. + private func makeSyntheticWAL(phrases: [String]) -> Data { + let pageSize = 4096 + var wal = Data() + + // WAL header (32 bytes, big-endian on disk): + // magic, file format version, page size, checkpoint sequence, salt-1, salt-2, + // checksum-1, checksum-2. + wal.append(contentsOf: [0x37, 0x7F, 0x06, 0x82]) + wal.appendUInt32BE(3_007_000) + wal.appendUInt32BE(UInt32(pageSize)) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + + // Frame header (24 bytes): + // page number (>1 so the parser doesn't apply page-1's 100-byte DB-header offset), + // commit size, salt-1, salt-2, checksum-1, checksum-2. + wal.appendUInt32BE(2) + wal.appendUInt32BE(UInt32(phrases.count)) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + + // Build cells, placing them at the tail of the page (SQLite cell content area). + var page = [UInt8](repeating: 0, count: pageSize) + var cellOffsets: [Int] = [] + var contentCursor = pageSize + for (i, phrase) in phrases.enumerated() { + let textBytes = Array(phrase.utf8) + precondition(textBytes.count <= 127, "Synthetic builder only supports short strings") + let serialType = UInt8(textBytes.count * 2 + 13) + // header_length varint (1 byte) + serial_type varint (1 byte) + let headerLength: UInt8 = 2 + let payloadLength = UInt8(Int(headerLength) + textBytes.count) + let rowid = UInt8(i + 1) + let cell: [UInt8] = [payloadLength, rowid, headerLength, serialType] + textBytes + contentCursor -= cell.count + for (j, byte) in cell.enumerated() { + page[contentCursor + j] = byte + } + cellOffsets.append(contentCursor) + } + + // Page header (8 bytes): type (0x0D = table leaf), first-freeblock offset (0 = none), + // cell count, cell-content-area start, fragmented free byte count. + page[0] = 0x0D + page[1] = 0 + page[2] = 0 + page[3] = UInt8(phrases.count >> 8) + page[4] = UInt8(phrases.count & 0xFF) + let contentStart = UInt16(cellOffsets.last ?? pageSize) + page[5] = UInt8(contentStart >> 8) + page[6] = UInt8(contentStart & 0xFF) + page[7] = 0 + + // Cell pointer array (in rowid/insertion order) + for (i, offset) in cellOffsets.enumerated() { + let ptrPos = 8 + i * 2 + page[ptrPos] = UInt8(offset >> 8) + page[ptrPos + 1] = UInt8(offset & 0xFF) + } + + wal.append(contentsOf: page) + return wal + } +} + +private extension Data { + mutating func appendUInt32BE(_ value: UInt32) { + append(UInt8((value >> 24) & 0xFF)) + append(UInt8((value >> 16) & 0xFF)) + append(UInt8((value >> 8) & 0xFF)) + append(UInt8(value & 0xFF)) + } }