From 37eb20e85a80cc471a4f90e99eb051c8a80fc73f Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 19:40:43 +0000 Subject: [PATCH 1/4] Always show "Recover Old Dittos" in the menu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The entry point was gated on a legacy SQLite file being present on disk, so users whose 2.x data was destroyed by 3.0.0's cleanup saw no menu at all — exactly the cohort that most needs a clear explanation. runLegacyRecovery() already surfaces a structured result for every case (nothing on disk, found-but-unreadable, empty store, inserted N), so unconditionally showing the button gives those users an explanation instead of a missing UI affordance. https://claude.ai/code/session_019wBT9jw5uskSPiuFSyy7gQ --- Ditto/DittoListView.swift | 26 ++++++++++++------------ Ditto/LegacyDataMigrator.swift | 8 -------- DittoTests/LegacyDataMigratorTests.swift | 15 -------------- 3 files changed, 13 insertions(+), 36 deletions(-) diff --git a/Ditto/DittoListView.swift b/Ditto/DittoListView.swift index cd97f9e..666ad6f 100644 --- a/Ditto/DittoListView.swift +++ b/Ditto/DittoListView.swift @@ -88,20 +88,20 @@ struct DittoListView: View { Label("Set Up Keyboard", systemImage: KeyboardSetupStatus.hasFullAccess ? "keyboard.fill" : "keyboard") } - if LegacyDataMigrator.hasRecoverableLegacyData { - Button { - // Show the preview confirmation if we can read the - // legacy store; otherwise fall straight to the - // attempt-and-report-result flow so the user sees - // *why* recovery failed instead of a missing menu. - if let preview = LegacyDataMigrator.previewRecoverableData() { - legacyRecoveryPreview = preview - } else { - runLegacyRecovery() - } - } label: { - Label("Recover Old Dittos", systemImage: "tray.and.arrow.down") + // Always offer the entry point, even when we can't see a + // legacy store on disk. 
Users whose 2.x data was destroyed + // by the 3.0.0 cleanup will still tap here looking for it; + // runLegacyRecovery() reports the structured result + // (nothingOnDisk / foundButUnreadable / emptyStore / inserted) + // so they get a real explanation instead of a missing menu. + Button { + if let preview = LegacyDataMigrator.previewRecoverableData() { + legacyRecoveryPreview = preview + } else { + runLegacyRecovery() } + } label: { + Label("Recover Old Dittos", systemImage: "tray.and.arrow.down") } Button { diff --git a/Ditto/LegacyDataMigrator.swift b/Ditto/LegacyDataMigrator.swift index fe17378..3c21507 100644 --- a/Ditto/LegacyDataMigrator.swift +++ b/Ditto/LegacyDataMigrator.swift @@ -52,14 +52,6 @@ enum LegacyDataMigrator { return exists } - /// True if a legacy SQLite file is on disk *anywhere* we know to look, regardless of - /// whether we can actually open it. The "Recover Old Dittos" menu item uses this so - /// users with an unreadable-but-present store still see the entry point — they get a - /// useful error from `recoverNow` instead of a silently-missing menu item. - static var hasRecoverableLegacyData: Bool { - legacyStoreURL != nil - } - /// Snapshot for the confirmation alert. struct RecoveryPreview { let categoryCount: Int diff --git a/DittoTests/LegacyDataMigratorTests.swift b/DittoTests/LegacyDataMigratorTests.swift index d498672..60ba6c1 100644 --- a/DittoTests/LegacyDataMigratorTests.swift +++ b/DittoTests/LegacyDataMigratorTests.swift @@ -161,21 +161,6 @@ struct LegacyDataMigratorTests { #expect(!LegacyDataMigrator.needsMigration) } - @Test("hasRecoverableLegacyData ignores the completion flag") - func hasRecoverableIgnoresFlag() { - clearFlag() - defer { clearFlag() } - - // The unit test sandbox has no App Group container, so legacyStoreURL is nil. - // What we're verifying here is the negative: with no store on disk, - // hasRecoverableLegacyData is false regardless of the flag state. 
- appGroupDefaults()?.set(true, forKey: completeKey) - #expect(!LegacyDataMigrator.hasRecoverableLegacyData) - - appGroupDefaults()?.removeObject(forKey: completeKey) - #expect(!LegacyDataMigrator.hasRecoverableLegacyData) - } - @Test("Auto-migration marks the completion flag even when no store is on disk") func autoMigrationMarksCompleteWhenNoStore() throws { clearFlag() From a74c207ee98cbf366a718d1fd73f7d118973d282 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 19:58:36 +0000 Subject: [PATCH 2/4] Recover dittos from orphaned WAL sidecars When 3.0.0's cleanup deleted the main Ditto.sqlite but left -wal behind, the file holds pages that would have been checkpointed to the main DB on next open. We can't reopen them through SQLite (the WAL salt references a header we no longer have), so this parses raw WAL frames and walks SQLite B-tree leaf pages to extract TEXT record values directly. WALSidecarRecovery is a standalone parser exercised by LegacyDataMigrator when recoverNow finds no main .sqlite but does find an orphan -wal. Phrases land in a single "Recovered" category since category structure is unrecoverable from the WAL alone. The confirmation alert surfaces a distinct message for that mode so users know what to expect. Best-effort by construction: WAL checksums aren't verified, overflow chains aren't followed (we don't have the main DB's free-page list), and filtering drops obvious Core Data internals (Z_PRIMARYKEY, etc.) plus too-short/too-long strings. Synthetic-WAL tests exercise the parser against the SQLite file format directly. 
https://claude.ai/code/session_019wBT9jw5uskSPiuFSyy7gQ --- Ditto.xcodeproj/project.pbxproj | 4 + Ditto/DittoListView.swift | 15 +- Ditto/LegacyDataMigrator.swift | 147 ++++++++++++++++- Ditto/WALSidecarRecovery.swift | 194 +++++++++++++++++++++++ DittoTests/LegacyDataMigratorTests.swift | 143 +++++++++++++++++ 5 files changed, 491 insertions(+), 12 deletions(-) create mode 100644 Ditto/WALSidecarRecovery.swift diff --git a/Ditto.xcodeproj/project.pbxproj b/Ditto.xcodeproj/project.pbxproj index 836a2d3..5bfe819 100644 --- a/Ditto.xcodeproj/project.pbxproj +++ b/Ditto.xcodeproj/project.pbxproj @@ -11,6 +11,7 @@ CC000001AAAA000000000002 /* CloudSyncManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA000000000012 /* CloudSyncManager.swift */; }; CC000001AAAA000000000003 /* SubscriptionView.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA000000000013 /* SubscriptionView.swift */; }; CC000001AAAA000000000004 /* LegacyDataMigrator.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA000000000014 /* LegacyDataMigrator.swift */; }; + CC000001AAAA000000000030 /* WALSidecarRecovery.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA000000000031 /* WALSidecarRecovery.swift */; }; CC000001AAAA000000000016 /* DittoImportExport.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA000000000019 /* DittoImportExport.swift */; }; CC000001AAAA00000000001A /* Localizable.xcstrings in Resources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA00000000001B /* Localizable.xcstrings */; }; CC000001AAAA00000000001C /* SyncSettings.swift in Sources */ = {isa = PBXBuildFile; fileRef = CC000001AAAA00000000001D /* SyncSettings.swift */; }; @@ -111,6 +112,7 @@ CC000001AAAA000000000012 /* CloudSyncManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CloudSyncManager.swift; sourceTree = ""; }; CC000001AAAA000000000013 /* SubscriptionView.swift */ = {isa = PBXFileReference; 
lastKnownFileType = sourcecode.swift; path = SubscriptionView.swift; sourceTree = ""; }; CC000001AAAA000000000014 /* LegacyDataMigrator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LegacyDataMigrator.swift; sourceTree = ""; }; + CC000001AAAA000000000031 /* WALSidecarRecovery.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WALSidecarRecovery.swift; sourceTree = ""; }; CC000001AAAA000000000019 /* DittoImportExport.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DittoImportExport.swift; sourceTree = ""; }; CC000001AAAA00000000001B /* Localizable.xcstrings */ = {isa = PBXFileReference; lastKnownFileType = text.json.xcstrings; path = Localizable.xcstrings; sourceTree = ""; }; CC000001AAAA00000000001D /* SyncSettings.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SyncSettings.swift; sourceTree = ""; }; @@ -291,6 +293,7 @@ CC000001AAAA000000000012 /* CloudSyncManager.swift */, CC000001AAAA000000000013 /* SubscriptionView.swift */, CC000001AAAA000000000014 /* LegacyDataMigrator.swift */, + CC000001AAAA000000000031 /* WALSidecarRecovery.swift */, CC000001AAAA000000000019 /* DittoImportExport.swift */, CC000001AAAA00000000001B /* Localizable.xcstrings */, CC000001AAAA00000000001D /* SyncSettings.swift */, @@ -572,6 +575,7 @@ CC000001AAAA000000000002 /* CloudSyncManager.swift in Sources */, CC000001AAAA000000000003 /* SubscriptionView.swift in Sources */, CC000001AAAA000000000004 /* LegacyDataMigrator.swift in Sources */, + CC000001AAAA000000000030 /* WALSidecarRecovery.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/Ditto/DittoListView.swift b/Ditto/DittoListView.swift index 666ad6f..d23a8c2 100644 --- a/Ditto/DittoListView.swift +++ b/Ditto/DittoListView.swift @@ -207,10 +207,17 @@ struct DittoListView: View { } } message: { if let preview = legacyRecoveryPreview { - Text( - // swiftlint:disable:next 
line_length - "Found \(preview.dittoCount) dittos across \(preview.categoryCount) categories from your previous version of Ditto. Recovering will merge them into your current library; duplicates will be skipped." - ) + if preview.isWALRecovery { + Text( + // swiftlint:disable:next line_length + "Found \(preview.dittoCount) recoverable phrases in a backup file from your previous version of Ditto. The original category structure couldn't be preserved — recovered phrases will be added to a single \"Recovered\" category for you to re-organize." + ) + } else { + Text( + // swiftlint:disable:next line_length + "Found \(preview.dittoCount) dittos across \(preview.categoryCount) categories from your previous version of Ditto. Recovering will merge them into your current library; duplicates will be skipped." + ) + } } } .alert("Recovery Complete", isPresented: .init( diff --git a/Ditto/LegacyDataMigrator.swift b/Ditto/LegacyDataMigrator.swift index 3c21507..ed3d4e5 100644 --- a/Ditto/LegacyDataMigrator.swift +++ b/Ditto/LegacyDataMigrator.swift @@ -24,6 +24,7 @@ import SwiftData /// the model wasn't bundled in the main app, so model-loading silently returned `[]`, /// and the migrator interpreted that as "nothing to migrate, safe to clean up" — destroying /// the user's data. We will never call `removeItem` on the legacy store, even on success. +// swiftlint:disable:next type_body_length @available(iOS, deprecated: 18.0, message: "Remove once all users have migrated from the v2 Core Data store (target: v4.0)") enum LegacyDataMigrator { @@ -56,18 +57,34 @@ enum LegacyDataMigrator { struct RecoveryPreview { let categoryCount: Int let dittoCount: Int + /// True when the only thing on disk is an orphaned WAL sidecar (no main + /// `.sqlite`). Category structure is unrecoverable in this mode — items + /// land in a single "Recovered" bucket — so the UI surfaces a different + /// confirmation message. 
+ let isWALRecovery: Bool } /// Reads the legacy store (without mutating it) and returns how many categories / - /// dittos would be imported. Returns nil if there's no recoverable store on disk - /// or the file is present but unreadable. + /// dittos would be imported. When the main `.sqlite` is missing but a `-wal` + /// sidecar exists, falls back to a WAL-frame extraction preview. Returns nil if + /// nothing is recoverable from either path. static func previewRecoverableData() -> RecoveryPreview? { - guard let url = legacyStoreURL else { return nil } - guard let legacy = try? readLegacyStore(at: url), !legacy.isEmpty else { return nil } - return RecoveryPreview( - categoryCount: legacy.count, - dittoCount: legacy.reduce(0) { $0 + $1.dittos.count } - ) + if let url = legacyStoreURL, + let legacy = try? readLegacyStore(at: url), + !legacy.isEmpty { + return RecoveryPreview( + categoryCount: legacy.count, + dittoCount: legacy.reduce(0) { $0 + $1.dittos.count }, + isWALRecovery: false + ) + } + if let walURL = findOrphanWAL() { + let phrases = WALSidecarRecovery.extractPhrases(from: walURL) + if !phrases.isEmpty { + return RecoveryPreview(categoryCount: 1, dittoCount: phrases.count, isWALRecovery: true) + } + } + return nil } /// Outcome of a manual recovery attempt. Surfaced in the UI so users see *why* nothing @@ -106,8 +123,15 @@ enum LegacyDataMigrator { /// flag. Never deletes the legacy store. Returns a structured result so the UI can /// distinguish "nothing on disk" from "file present but unreadable" from "successfully /// imported N dittos". + /// + /// If the main `.sqlite` is gone but a `-wal` sidecar survives, falls through to a raw + /// WAL-frame extraction that pulls candidate phrases out of the journal directly. + /// Category structure is lost in that mode — phrases land in a "Recovered" bucket. 
static func recoverNow(into context: ModelContext) -> RecoveryResult { guard let storeURL = legacyStoreURL else { + if let walURL = findOrphanWAL() { + return runWALSidecarRecovery(at: walURL, into: context) + } log.info("recoverNow: no legacy store on disk") logOutcome(source: "manual", outcome: "nothing_on_disk") return .nothingOnDisk @@ -316,6 +340,28 @@ enum LegacyDataMigrator { return nil } + /// Returns the URL of a `.sqlite-wal` file that has no main `.sqlite` alongside it. + /// Distinct from `findOrphanWALOrSHM` — that one's for diagnostic logging and accepts + /// either sidecar; this one only returns `-wal`, the only file that actually carries + /// page data we can extract from. SHM files contain just the WAL index and are useless + /// without the WAL. + private static func findOrphanWAL() -> URL? { + let fm = FileManager.default + guard let groupURL = fm.containerURL(forSecurityApplicationGroupIdentifier: appGroupIdentifier) else { + return nil + } + let names = ["Ditto.sqlite", "ditto.sqlite", "Ditto.SQLite"] + for base in names { + let walURL = groupURL.appendingPathComponent(base + "-wal") + guard fm.fileExists(atPath: walURL.path) else { continue } + let main = groupURL.appendingPathComponent(base) + if !fm.fileExists(atPath: main.path) { + return walURL + } + } + return nil + } + private static func fileSize(at url: URL) -> Int64 { let attrs = try? FileManager.default.attributesOfItem(atPath: url.path) return (attrs?[.size] as? NSNumber)?.int64Value ?? 0 @@ -554,4 +600,89 @@ enum LegacyDataMigrator { private static func markComplete() { UserDefaults(suiteName: appGroupIdentifier)?.set(true, forKey: migrationCompleteKey) } + + // MARK: - WAL sidecar recovery + + /// Title of the bucket category created when WAL-frame recovery succeeds. Surfaced + /// to the user in the confirmation alert so they know where to look. + static let walRecoveryCategoryTitle = "Recovered" + + /// Manual recovery from an orphaned `-wal` file. 
Pulls text out of WAL frame + /// payloads directly via `WALSidecarRecovery`, then writes the phrases into a + /// single `walRecoveryCategoryTitle` bucket since category structure is unrecoverable. + private static func runWALSidecarRecovery(at walURL: URL, into context: ModelContext) -> RecoveryResult { + log.info("runWALSidecarRecovery: parsing \(walURL.path, privacy: .public) (\(fileSize(at: walURL), privacy: .public) bytes)") + let phrases = WALSidecarRecovery.extractPhrases(from: walURL) + guard !phrases.isEmpty else { + log.info("runWALSidecarRecovery: WAL parsed but yielded no recoverable phrases") + logOutcome(source: "manual_wal", outcome: "empty_store") + return .emptyStore + } + + let beforeCount = (try? context.fetch(FetchDescriptor()).count) ?? 0 + writeWALPhrases(phrases, into: context) + + do { + try context.save() + } catch { + log.error("runWALSidecarRecovery: save failed: \(error.localizedDescription, privacy: .public)") + logOutcome(source: "manual_wal", outcome: "found_unreadable") + return .foundButUnreadable(error.localizedDescription) + } + + markComplete() + let afterCount = (try? context.fetch(FetchDescriptor()).count) ?? 0 + let inserted = max(0, afterCount - beforeCount) + log.info("runWALSidecarRecovery: inserted \(inserted, privacy: .public) phrases from WAL") + logOutcome( + source: "manual_wal", + outcome: inserted > 0 ? "success" : "no_new_data", + categoriesFound: 1, + dittosFound: phrases.count, + inserted: inserted + ) + return .inserted(inserted) + } + + /// Inserts WAL-extracted phrases into a single "Recovered" category, deduplicating + /// against anything already in that category. + @discardableResult + private static func writeWALPhrases(_ phrases: [String], into context: ModelContext) -> Int { + let profile: Profile + if let existing = (try? 
context.fetch(FetchDescriptor()))?.first { + profile = existing + } else { + profile = Profile() + context.insert(profile) + } + + let existingByTitle = Dictionary( + profile.orderedCategories.map { ($0.title, $0) }, + uniquingKeysWith: { first, _ in first } + ) + let category: DittoCategory + if let existing = existingByTitle[walRecoveryCategoryTitle] { + category = existing + } else { + category = DittoCategory(title: walRecoveryCategoryTitle, profile: profile) + category.sortOrder = profile.orderedCategories.count + context.insert(category) + profile.categories?.append(category) + } + + let existingTexts = Set((category.dittos ?? []).map { $0.text }) + var nextSort = (category.dittos ?? []).count + var inserted = 0 + for phrase in phrases { + guard !existingTexts.contains(phrase) else { continue } + let item = DittoItem(text: phrase, category: category) + item.sortOrder = nextSort + nextSort += 1 + context.insert(item) + category.dittos?.append(item) + inserted += 1 + } + log.info("writeWALPhrases: inserted \(inserted, privacy: .public) phrases into '\(walRecoveryCategoryTitle, privacy: .public)'") + return inserted + } } diff --git a/Ditto/WALSidecarRecovery.swift b/Ditto/WALSidecarRecovery.swift new file mode 100644 index 0000000..16fea44 --- /dev/null +++ b/Ditto/WALSidecarRecovery.swift @@ -0,0 +1,194 @@ +import Foundation +import OSLog + +/// Last-resort recovery from an orphaned SQLite `-wal` file whose main `.sqlite` was +/// deleted by the 3.0.0 cleanup. +/// +/// The WAL contains pages that would have been merged back into the main DB on the next +/// checkpoint. We can't reopen them through SQLite (the WAL's salt values reference a +/// DB header we no longer have), so this parses the WAL frames directly and walks the +/// SQLite B-tree leaf pages within them to extract TEXT-typed record values. 
+/// +/// Format references: +/// - WAL: https://www.sqlite.org/walformat.html +/// - DB pages / records: https://www.sqlite.org/fileformat.html +/// +/// Recovery is best-effort by construction: WAL frame checksums are not verified, and +/// cells whose payload spills onto overflow pages are skipped (the chained pages may +/// exist only in the deleted main DB, so chains are not followed). For typical ditto-length +/// strings none of that matters. +enum WALSidecarRecovery { + + private static let log = Logger(subsystem: "io.kern.ditto", category: "WALSidecarRecovery") + + /// Extracts deduplicated TEXT-typed record values from table B-tree leaf pages + /// within `walURL`. Returned strings are sorted, trimmed, and filtered to drop: + /// - very short strings (`< 2` chars) — pure noise + /// - very long strings (`> 5000` chars) — almost certainly mis-parsed bytes + /// - obvious Core Data internal identifiers (`Z_PRIMARYKEY`, etc.) + static func extractPhrases(from walURL: URL) -> [String] { + guard let data = try? Data(contentsOf: walURL) else { + log.error("extractPhrases: could not read \(walURL.path, privacy: .public)") + return [] + } + return parse([UInt8](data)) + } + + /// Same as `extractPhrases(from:)` but takes raw bytes. Internal entry point so + /// tests can drive the parser against synthetic data without touching disk. + static func parse(_ bytes: [UInt8]) -> [String] { + guard bytes.count >= 32 else { return [] } + // WAL magic: 0x377F0682 or 0x377F0683; the low bit only selects the byte order used + // for frame checksums. Header fields are always stored big-endian on disk regardless. + let magic = readUInt32BE(bytes, at: 0) + guard magic == 0x377F_0682 || magic == 0x377F_0683 else { return [] } + let pageSize = Int(readUInt32BE(bytes, at: 8)) + // SQLite page sizes are powers of two between 512 and 65536. 
+ guard pageSize >= 512, pageSize <= 65536, pageSize.nonzeroBitCount == 1 else { return [] } + + var results: Set = [] + let frameSize = 24 + pageSize + var offset = 32 // past WAL header + while offset + frameSize <= bytes.count { + let pageNumber = readUInt32BE(bytes, at: offset) + let pageBase = offset + 24 + // Page 1 has the 100-byte SQLite DB header before its B-tree header. + let btreeStart = pageNumber == 1 ? 100 : 0 + extractFromPage(bytes, pageBase: pageBase, pageSize: pageSize, btreeStart: btreeStart, into: &results) + offset += frameSize + } + return results.sorted() + } + + // MARK: - B-tree page walker + + private static func extractFromPage( + _ bytes: [UInt8], + pageBase: Int, + pageSize: Int, + btreeStart: Int, + into results: inout Set + ) { + let pageEnd = pageBase + pageSize + let header = pageBase + btreeStart + guard header < bytes.count, header < pageEnd else { return } + // Only table B-tree leaf pages (0x0D) carry the row payloads we want. Index + // pages, interior pages, and freelist pages don't contain user TEXT. + guard bytes[header] == 0x0D else { return } + + let cellCount = Int(readUInt16BE(bytes, at: header + 3)) + // Table-leaf page header is 8 bytes; cell pointer array follows. + let cellPtrStart = header + 8 + for i in 0..= pageBase, cellAbs < pageEnd else { continue } + parseTableLeafCell(bytes, at: cellAbs, pageEnd: pageEnd, into: &results) + } + } + + private static func parseTableLeafCell( + _ bytes: [UInt8], + at offset: Int, + pageEnd: Int, + into results: inout Set + ) { + var cursor = offset + guard let (payloadLength, n1) = readVarint(bytes, at: cursor) else { return } + cursor += n1 + guard let (_, n2) = readVarint(bytes, at: cursor) else { return } + cursor += n2 + + let payloadEnd = cursor + Int(payloadLength) + // If the payload extends past the page, this cell uses overflow pages. + // Skip — extracting from overflow without the main DB's free-page map is + // not worth the complexity for typical ditto-sized text. 
+ guard payloadEnd <= pageEnd, payloadEnd <= bytes.count else { return } + + let recordStart = cursor + guard let (headerLength, hLen) = readVarint(bytes, at: cursor) else { return } + var headerCursor = cursor + hLen + let headerEnd = recordStart + Int(headerLength) + guard headerEnd <= payloadEnd else { return } + + var serials: [UInt64] = [] + while headerCursor < headerEnd { + guard let (st, sz) = readVarint(bytes, at: headerCursor) else { return } + serials.append(st) + headerCursor += sz + } + + var body = headerEnd + for st in serials { + let info = serialTypeInfo(st) + guard body + info.size <= payloadEnd else { return } + if info.isText && info.size > 0 { + let slice = Array(bytes[body..<(body + info.size)]) + if let raw = String(bytes: slice, encoding: .utf8) { + let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines) + if trimmed.count >= 2, trimmed.count <= 5000, !isInternalString(trimmed) { + results.insert(trimmed) + } + } + } + body += info.size + } + } + + private static func serialTypeInfo(_ st: UInt64) -> (size: Int, isText: Bool) { + switch st { + case 0, 8, 9, 10, 11: return (0, false) + case 1: return (1, false) + case 2: return (2, false) + case 3: return (3, false) + case 4: return (4, false) + case 5: return (6, false) + case 6, 7: return (8, false) + default: + guard st >= 12 else { return (0, false) } + return (Int((st - 12) / 2), st % 2 == 1) + } + } + + private static func isInternalString(_ s: String) -> Bool { + // Core Data's Z_PRIMARYKEY / Z_METADATA / Z_MODELCACHE tables store entity + // names and serialized schema state as TEXT — we don't want to surface those + // alongside the user's actual ditto phrases. 
+ let internalPrefixes = ["Z_PRIMARYKEY", "Z_METADATA", "Z_MODELCACHE", "NSStoreType"] + return internalPrefixes.contains { s.hasPrefix($0) } + } + + // MARK: - Byte readers + + private static func readUInt16BE(_ bytes: [UInt8], at offset: Int) -> UInt16 { + guard offset + 1 < bytes.count else { return 0 } + return UInt16(bytes[offset]) << 8 | UInt16(bytes[offset + 1]) + } + + private static func readUInt32BE(_ bytes: [UInt8], at offset: Int) -> UInt32 { + guard offset + 3 < bytes.count else { return 0 } + return UInt32(bytes[offset]) << 24 + | UInt32(bytes[offset + 1]) << 16 + | UInt32(bytes[offset + 2]) << 8 + | UInt32(bytes[offset + 3]) + } + + private static func readVarint(_ bytes: [UInt8], at offset: Int) -> (value: UInt64, bytesRead: Int)? { + var result: UInt64 = 0 + for i in 0..<9 { + guard offset + i < bytes.count else { return nil } + let byte = bytes[offset + i] + if i == 8 { + result = (result << 8) | UInt64(byte) + return (result, 9) + } + result = (result << 7) | UInt64(byte & 0x7F) + if byte & 0x80 == 0 { + return (result, i + 1) + } + } + return nil + } +} diff --git a/DittoTests/LegacyDataMigratorTests.swift b/DittoTests/LegacyDataMigratorTests.swift index 60ba6c1..7b948ab 100644 --- a/DittoTests/LegacyDataMigratorTests.swift +++ b/DittoTests/LegacyDataMigratorTests.swift @@ -171,4 +171,147 @@ struct LegacyDataMigratorTests { #expect(!result) #expect(appGroupDefaults()?.bool(forKey: completeKey) == true) } + + // MARK: - WAL sidecar recovery + + @Test("extractPhrases pulls TEXT records out of synthetic WAL frames") + func walParserExtractsTextRecords() throws { + let wal = makeSyntheticWAL(phrases: [ + "meeting at ___", + "OOO today", + "on my way" + ]) + let walURL = try writeTempFile(wal, suffix: ".sqlite-wal") + defer { try? 
FileManager.default.removeItem(at: walURL) } + + let phrases = WALSidecarRecovery.extractPhrases(from: walURL) + #expect(phrases.contains("meeting at ___")) + #expect(phrases.contains("OOO today")) + #expect(phrases.contains("on my way")) + } + + @Test("extractPhrases drops too-short strings and Core Data internals") + func walParserFiltersNoise() throws { + let wal = makeSyntheticWAL(phrases: [ + "a", // 1 char → dropped + "Z_PRIMARYKEY", // Core Data internal → dropped + "Z_METADATA blob", // Core Data internal prefix → dropped + "real ditto phrase" // user content → kept + ]) + let walURL = try writeTempFile(wal, suffix: ".sqlite-wal") + defer { try? FileManager.default.removeItem(at: walURL) } + + let phrases = WALSidecarRecovery.extractPhrases(from: walURL) + #expect(phrases.contains("real ditto phrase")) + #expect(!phrases.contains("a")) + #expect(!phrases.contains("Z_PRIMARYKEY")) + #expect(!phrases.contains("Z_METADATA blob")) + } + + @Test("extractPhrases returns empty for a file without WAL magic") + func walParserRejectsGarbage() throws { + let walURL = try writeTempFile(Data(repeating: 0xFF, count: 200), suffix: ".sqlite-wal") + defer { try? FileManager.default.removeItem(at: walURL) } + + #expect(WALSidecarRecovery.extractPhrases(from: walURL).isEmpty) + } + + @Test("extractPhrases deduplicates repeated phrases across frames") + func walParserDeduplicates() throws { + let wal = makeSyntheticWAL(phrases: ["duplicate phrase", "duplicate phrase", "unique phrase"]) + let walURL = try writeTempFile(wal, suffix: ".sqlite-wal") + defer { try? FileManager.default.removeItem(at: walURL) } + + let phrases = WALSidecarRecovery.extractPhrases(from: walURL) + #expect(phrases.filter { $0 == "duplicate phrase" }.count == 1) + #expect(phrases.contains("unique phrase")) + } + + // MARK: - Synthetic WAL builder + + /// Writes `data` to a uniquely-named temp file with the given suffix and returns its URL. 
+ private func writeTempFile(_ data: Data, suffix: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString + suffix) + try data.write(to: url) + return url + } + + /// Builds a minimal valid SQLite WAL file containing a single frame whose page is a + /// table B-tree leaf with one cell per input string. Each cell is a record with one + /// TEXT column. Page size is 4096; the page is page #2 so the parser doesn't apply + /// the 100-byte SQLite-DB-header offset. + /// + /// Strings must be ≤57 bytes after UTF-8 encoding so every varint, including the serial + /// type `2n + 13`, fits in 1 byte; note the `precondition` in the body is looser than that. + private func makeSyntheticWAL(phrases: [String]) -> Data { + let pageSize = 4096 + var wal = Data() + + // WAL header (32 bytes, big-endian on disk) + wal.append(contentsOf: [0x37, 0x7F, 0x06, 0x82]) // magic + wal.appendUInt32BE(3_007_000) // file format version + wal.appendUInt32BE(UInt32(pageSize)) // page size + wal.appendUInt32BE(0) // checkpoint sequence + wal.appendUInt32BE(0) // salt-1 + wal.appendUInt32BE(0) // salt-2 + wal.appendUInt32BE(0) // checksum-1 + wal.appendUInt32BE(0) // checksum-2 + + // Frame header (24 bytes) + wal.appendUInt32BE(2) // page number (>1 to skip DB header) + wal.appendUInt32BE(UInt32(phrases.count)) // commit size + wal.appendUInt32BE(0) // salt-1 + wal.appendUInt32BE(0) // salt-2 + wal.appendUInt32BE(0) // checksum-1 + wal.appendUInt32BE(0) // checksum-2 + + // Build cells, placing them at the tail of the page (SQLite cell content area). 
+ var page = [UInt8](repeating: 0, count: pageSize) + var cellOffsets: [Int] = [] + var contentCursor = pageSize + for (i, phrase) in phrases.enumerated() { + let textBytes = Array(phrase.utf8) + precondition(textBytes.count <= 127, "Synthetic builder only supports short strings") + let serialType = UInt8(textBytes.count * 2 + 13) + let headerLength: UInt8 = 2 // header_length varint + serial_type varint + let payloadLength = UInt8(Int(headerLength) + textBytes.count) + let rowid = UInt8(i + 1) + let cell: [UInt8] = [payloadLength, rowid, headerLength, serialType] + textBytes + contentCursor -= cell.count + for (j, byte) in cell.enumerated() { + page[contentCursor + j] = byte + } + cellOffsets.append(contentCursor) + } + + // Page header (8 bytes) + page[0] = 0x0D // table leaf + page[1] = 0; page[2] = 0 // first freeblock = none + page[3] = UInt8(phrases.count >> 8) + page[4] = UInt8(phrases.count & 0xFF) // cell count + let contentStart = UInt16(cellOffsets.last ?? pageSize) + page[5] = UInt8(contentStart >> 8) + page[6] = UInt8(contentStart & 0xFF) + page[7] = 0 // fragmented free bytes + + // Cell pointer array (in rowid/insertion order) + for (i, offset) in cellOffsets.enumerated() { + let ptrPos = 8 + i * 2 + page[ptrPos] = UInt8(offset >> 8) + page[ptrPos + 1] = UInt8(offset & 0xFF) + } + + wal.append(contentsOf: page) + return wal + } +} + +private extension Data { + mutating func appendUInt32BE(_ value: UInt32) { + append(UInt8((value >> 24) & 0xFF)) + append(UInt8((value >> 16) & 0xFF)) + append(UInt8((value >> 8) & 0xFF)) + append(UInt8(value & 0xFF)) + } } From 7f1df81eadec3ba8a387aded523c6aad860f7922 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 20:28:02 +0000 Subject: [PATCH 3/4] Fix SwiftLint/SwiftFormat findings on the recovery flow - file_length/type_body_length: LegacyDataMigrator legitimately concentrates discovery/read/write/telemetry plus the orphan-WAL fallback in one place because they share App-Group state and the outcome 
telemetry pipeline. Disable both rules at the file level with a comment explaining why. - consecutiveSpaces: drop the vertical alignment on inline comments in the synthetic-WAL test helper; SwiftFormat collapses those. https://claude.ai/code/session_019wBT9jw5uskSPiuFSyy7gQ --- Ditto/LegacyDataMigrator.swift | 8 ++- DittoTests/LegacyDataMigratorTests.swift | 63 ++++++++++++++---------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/Ditto/LegacyDataMigrator.swift b/Ditto/LegacyDataMigrator.swift index ed3d4e5..befe949 100644 --- a/Ditto/LegacyDataMigrator.swift +++ b/Ditto/LegacyDataMigrator.swift @@ -1,3 +1,10 @@ +// swiftlint:disable file_length type_body_length +// +// This type concentrates discovery / read / write / telemetry for the v2 Core Data +// migration AND the orphan-WAL fallback in one place because they share state +// (App Group resolution, completion flag, logging category, outcome telemetry). +// Splitting them would force that state to be passed around or duplicated. + import CoreData import Foundation import OSLog @@ -24,7 +31,6 @@ import SwiftData /// the model wasn't bundled in the main app, so model-loading silently returned `[]`, /// and the migrator interpreted that as "nothing to migrate, safe to clean up" — destroying /// the user's data. We will never call `removeItem` on the legacy store, even on success. 
-// swiftlint:disable:next type_body_length @available(iOS, deprecated: 18.0, message: "Remove once all users have migrated from the v2 Core Data store (target: v4.0)") enum LegacyDataMigrator { diff --git a/DittoTests/LegacyDataMigratorTests.swift b/DittoTests/LegacyDataMigratorTests.swift index 7b948ab..2029a57 100644 --- a/DittoTests/LegacyDataMigratorTests.swift +++ b/DittoTests/LegacyDataMigratorTests.swift @@ -192,11 +192,13 @@ struct LegacyDataMigratorTests { @Test("extractPhrases drops too-short strings and Core Data internals") func walParserFiltersNoise() throws { + // "a" → too short (dropped), "Z_PRIMARYKEY" / "Z_METADATA blob" → Core Data + // internals (dropped), "real ditto phrase" → kept. let wal = makeSyntheticWAL(phrases: [ - "a", // 1 char → dropped - "Z_PRIMARYKEY", // Core Data internal → dropped - "Z_METADATA blob", // Core Data internal prefix → dropped - "real ditto phrase" // user content → kept + "a", + "Z_PRIMARYKEY", + "Z_METADATA blob", + "real ditto phrase" ]) let walURL = try writeTempFile(wal, suffix: ".sqlite-wal") defer { try? 
FileManager.default.removeItem(at: walURL) } @@ -248,23 +250,27 @@ struct LegacyDataMigratorTests { let pageSize = 4096 var wal = Data() - // WAL header (32 bytes, big-endian on disk) - wal.append(contentsOf: [0x37, 0x7F, 0x06, 0x82]) // magic - wal.appendUInt32BE(3_007_000) // file format version - wal.appendUInt32BE(UInt32(pageSize)) // page size - wal.appendUInt32BE(0) // checkpoint sequence - wal.appendUInt32BE(0) // salt-1 - wal.appendUInt32BE(0) // salt-2 - wal.appendUInt32BE(0) // checksum-1 - wal.appendUInt32BE(0) // checksum-2 - - // Frame header (24 bytes) - wal.appendUInt32BE(2) // page number (>1 to skip DB header) - wal.appendUInt32BE(UInt32(phrases.count)) // commit size - wal.appendUInt32BE(0) // salt-1 - wal.appendUInt32BE(0) // salt-2 - wal.appendUInt32BE(0) // checksum-1 - wal.appendUInt32BE(0) // checksum-2 + // WAL header (32 bytes, big-endian on disk): + // magic, file format version, page size, checkpoint sequence, salt-1, salt-2, + // checksum-1, checksum-2. + wal.append(contentsOf: [0x37, 0x7F, 0x06, 0x82]) + wal.appendUInt32BE(3_007_000) + wal.appendUInt32BE(UInt32(pageSize)) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + + // Frame header (24 bytes): + // page number (>1 so the parser doesn't apply page-1's 100-byte DB-header offset), + // commit size, salt-1, salt-2, checksum-1, checksum-2. + wal.appendUInt32BE(2) + wal.appendUInt32BE(UInt32(phrases.count)) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) + wal.appendUInt32BE(0) // Build cells, placing them at the tail of the page (SQLite cell content area). 
var page = [UInt8](repeating: 0, count: pageSize) @@ -274,7 +280,8 @@ struct LegacyDataMigratorTests { let textBytes = Array(phrase.utf8) precondition(textBytes.count <= 127, "Synthetic builder only supports short strings") let serialType = UInt8(textBytes.count * 2 + 13) - let headerLength: UInt8 = 2 // header_length varint + serial_type varint + // header_length varint (1 byte) + serial_type varint (1 byte) + let headerLength: UInt8 = 2 let payloadLength = UInt8(Int(headerLength) + textBytes.count) let rowid = UInt8(i + 1) let cell: [UInt8] = [payloadLength, rowid, headerLength, serialType] + textBytes @@ -285,15 +292,17 @@ struct LegacyDataMigratorTests { cellOffsets.append(contentCursor) } - // Page header (8 bytes) - page[0] = 0x0D // table leaf - page[1] = 0; page[2] = 0 // first freeblock = none + // Page header (8 bytes): type (0x0D = table leaf), first-freeblock offset (0 = none), + // cell count, cell-content-area start, fragmented free byte count. + page[0] = 0x0D + page[1] = 0 + page[2] = 0 page[3] = UInt8(phrases.count >> 8) - page[4] = UInt8(phrases.count & 0xFF) // cell count + page[4] = UInt8(phrases.count & 0xFF) let contentStart = UInt16(cellOffsets.last ?? pageSize) page[5] = UInt8(contentStart >> 8) page[6] = UInt8(contentStart & 0xFF) - page[7] = 0 // fragmented free bytes + page[7] = 0 // Cell pointer array (in rowid/insertion order) for (i, offset) in cellOffsets.enumerated() { From a5557fdb88fd73c59c5edde1b4b3c22b28a82e8c Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 20:30:54 +0000 Subject: [PATCH 4/4] Address SwiftLint findings on the WAL recovery code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - blanket_disable_command on LegacyDataMigrator.swift: add a matching swiftlint:enable for the file_length / type_body_length pair so the disable isn't blanket. - cyclomatic_complexity on parseTableLeafCell: split out parseRecordHeader, extractTexts, and decodeKeepableText. 
Behavior is identical; complexity per function drops from 12 to ≤6. https://claude.ai/code/session_019wBT9jw5uskSPiuFSyy7gQ --- Ditto/LegacyDataMigrator.swift | 2 ++ Ditto/WALSidecarRecovery.swift | 60 +++++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/Ditto/LegacyDataMigrator.swift b/Ditto/LegacyDataMigrator.swift index befe949..f0b56be 100644 --- a/Ditto/LegacyDataMigrator.swift +++ b/Ditto/LegacyDataMigrator.swift @@ -692,3 +692,5 @@ enum LegacyDataMigrator { return inserted } } + +// swiftlint:enable file_length type_body_length diff --git a/Ditto/WALSidecarRecovery.swift b/Ditto/WALSidecarRecovery.swift index 16fea44..0195d26 100644 --- a/Ditto/WALSidecarRecovery.swift +++ b/Ditto/WALSidecarRecovery.swift @@ -101,42 +101,70 @@ enum WALSidecarRecovery { guard let (_, n2) = readVarint(bytes, at: cursor) else { return } cursor += n2 - let payloadEnd = cursor + Int(payloadLength) // If the payload extends past the page, this cell uses overflow pages. // Skip — extracting from overflow without the main DB's free-page map is // not worth the complexity for typical ditto-sized text. + let payloadEnd = cursor + Int(payloadLength) guard payloadEnd <= pageEnd, payloadEnd <= bytes.count else { return } - let recordStart = cursor - guard let (headerLength, hLen) = readVarint(bytes, at: cursor) else { return } - var headerCursor = cursor + hLen - let headerEnd = recordStart + Int(headerLength) - guard headerEnd <= payloadEnd else { return } + guard let (serials, bodyStart) = parseRecordHeader(bytes, at: cursor, payloadEnd: payloadEnd) else { return } + extractTexts(bytes, serials: serials, bodyStart: bodyStart, payloadEnd: payloadEnd, into: &results) + } + + /// Parses the record header at `offset`, returning the list of serial-type codes + /// and the offset where the record body starts. Returns nil if the header is + /// truncated or doesn't fit in the payload. 
+ private static func parseRecordHeader( + _ bytes: [UInt8], + at offset: Int, + payloadEnd: Int + ) -> (serials: [UInt64], bodyStart: Int)? { + guard let (headerLength, hLen) = readVarint(bytes, at: offset) else { return nil } + var headerCursor = offset + hLen + let headerEnd = offset + Int(headerLength) + guard headerEnd <= payloadEnd else { return nil } var serials: [UInt64] = [] while headerCursor < headerEnd { - guard let (st, sz) = readVarint(bytes, at: headerCursor) else { return } + guard let (st, sz) = readVarint(bytes, at: headerCursor) else { return nil } serials.append(st) headerCursor += sz } + return (serials, headerEnd) + } - var body = headerEnd + /// Walks the record body once per serial type, decoding TEXT-typed fields and + /// inserting the keepable ones into `results`. Stops if any field would read + /// past `payloadEnd`. + private static func extractTexts( + _ bytes: [UInt8], + serials: [UInt64], + bodyStart: Int, + payloadEnd: Int, + into results: inout Set<String> + ) { + var body = bodyStart for st in serials { let info = serialTypeInfo(st) guard body + info.size <= payloadEnd else { return } - if info.isText && info.size > 0 { - let slice = Array(bytes[body..<(body + info.size)]) - if let raw = String(bytes: slice, encoding: .utf8) { - let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines) - if trimmed.count >= 2, trimmed.count <= 5000, !isInternalString(trimmed) { - results.insert(trimmed) - } - } + if info.isText, info.size > 0, let text = decodeKeepableText(bytes, at: body, size: info.size) { + results.insert(text) } body += info.size } } + /// Decodes `size` bytes at `offset` as UTF-8, trims whitespace, and returns the + /// string only if it passes the keepable-content filter (length bounds + not a + /// known Core Data internal identifier). + private static func decodeKeepableText(_ bytes: [UInt8], at offset: Int, size: Int) -> String? 
{ + let slice = Array(bytes[offset..<(offset + size)]) + guard let raw = String(bytes: slice, encoding: .utf8) else { return nil } + let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines) + guard trimmed.count >= 2, trimmed.count <= 5000, !isInternalString(trimmed) else { return nil } + return trimmed + } + private static func serialTypeInfo(_ st: UInt64) -> (size: Int, isText: Bool) { switch st { case 0, 8, 9, 10, 11: return (0, false)