From b67e4c4a0afaa59470744a8106be8713b1cf1c59 Mon Sep 17 00:00:00 2001 From: Micah Alpern Date: Wed, 10 Jun 2026 23:02:23 -0700 Subject: [PATCH 1/3] Stuck-key incident capture: add stdout tail + system load, document GUI-only gap (MAL-57) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 2026-06-10 MAL-57 incidents exposed three gaps in the stuck-key incident capture: 1. Captures snapshotted kanata stderr, but the pqrs driver-connection evidence (connected / driver connected: false / output backend unavailable / dropping KEY_* Release) lives in stdout. Add a stdout tail (last 200 lines) filtering the high-volume "virtual_hid_keyboard_ready true" heartbeat; "false" lines are kept since they indicate driver disconnects. 2. No system-load context despite CPU starvation being the leading hypothesis. Add loadavg (getloadavg) plus the top 5 CPU-consuming processes via ps; process listing is skipped under tests. 3. The detector only runs while KeyPath.app is open — both 2026-06-10 incidents were missed because the GUI was closed. Documented on the service and in docs/bugs/MAL-57-duplicate-keypresses.md. Log tails now read only the trailing 1MB of the file instead of loading it whole (kanata stdout grows to 175MB+). Co-Authored-By: Claude Fable 5 --- .../Monitoring/StuckKeyRecoveryService.swift | 107 ++++++++++++++++-- .../StuckKeyRecoveryServiceTests.swift | 49 ++++++++ docs/bugs/MAL-57-duplicate-keypresses.md | 27 +++++ 3 files changed, 176 insertions(+), 7 deletions(-) diff --git a/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift b/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift index b01c05ee8..35a1867d6 100644 --- a/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift +++ b/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift @@ -7,6 +7,11 @@ import KeyPathCore /// event, causing macOS to generate infinite autorepeat. This service monitors for that /// condition via AutorepeatMismatch correlations and triggers a kanata restart to clear the /// stale virtual HID state (kanata's startup F24 flush handles the actual key release). +/// +/// LIMITATION (MAL-57): This detector lives in the GUI app and only runs while KeyPath.app +/// is open. Stuck-key incidents that occur while the GUI is closed are neither recovered nor +/// captured — both 2026-06-10 MAL-57 incidents were missed for this reason. Daemon-side +/// detection would be needed to close that gap. @MainActor final class StuckKeyRecoveryService { static let shared = StuckKeyRecoveryService() @@ -127,14 +132,24 @@ final class StuckKeyRecoveryService { lines.append("session_id: \(snapshot.tracker.sessionID.map(String.init) ?? "none")") lines.append("") + lines.append("=== System Load ===") + lines.append(contentsOf: systemLoadLines()) + lines.append("") + lines.append("=== Kanata Stderr (last 50 lines) ===") - if let data = FileManager.default.contents(atPath: "/var/log/com.keypath.kanata.stderr.log"), - let content = String(data: data, encoding: .utf8) - { - lines.append(contentsOf: content.components(separatedBy: .newlines).suffix(50)) - } else { - lines.append("(file not readable)") - } + lines.append(contentsOf: logTail(path: "/var/log/com.keypath.kanata.stderr.log", maxLines: 50)) + lines.append("") + + // The pqrs driver-connection evidence (connected / driver connected / output backend + // unavailable / dropping KEY_* Release) goes to stdout, not stderr. The high-volume + // "virtual_hid_keyboard_ready true" heartbeat is filtered out; "... false" lines are + // kept because they indicate driver disconnects. + lines.append("=== Kanata Stdout (last 200 lines, vhid-ready spam filtered) ===") + lines.append(contentsOf: logTail( + path: "/var/log/com.keypath.kanata.stdout.log", + maxLines: 200, + excludingLinesContaining: "virtual_hid_keyboard_ready true" + )) lines.append("") lines.append("=== Recovery History ===") @@ -169,6 +184,84 @@ final class StuckKeyRecoveryService { } } } + + // MARK: - Capture Helpers + + /// Last `maxLines` lines of a log file, optionally dropping lines that contain `filter`. + /// Reads only the trailing chunk of the file — kanata's stdout log grows to hundreds of MB. + nonisolated static func logTail( + path: String, + maxLines: Int, + excludingLinesContaining filter: String? = nil + ) -> [String] { + guard let handle = FileHandle(forReadingAtPath: path) else { + return ["(file not readable)"] + } + defer { try? handle.close() } + + let maxBytes: UInt64 = 1024 * 1024 + guard let size = try? handle.seekToEnd() else { + return ["(file not readable)"] + } + try? handle.seek(toOffset: size > maxBytes ? size - maxBytes : 0) + guard let data = try? handle.readToEnd(), + let content = String(data: data, encoding: .utf8) + else { + return ["(file not readable)"] + } + + var tail = content.components(separatedBy: .newlines) + if let filter { + tail = tail.filter { !$0.contains(filter) } + } + return Array(tail.suffix(maxLines)) + } + + /// Load average plus the top CPU consumers — the leading MAL-57 hypothesis is + /// CPU-starvation-induced heartbeat misses, so each capture needs load context. + nonisolated static func systemLoadLines() -> [String] { + var lines: [String] = [] + var loads = [Double](repeating: 0, count: 3) + if getloadavg(&loads, 3) == 3 { + lines.append(String(format: "loadavg: %.2f %.2f %.2f", loads[0], loads[1], loads[2])) + } else { + lines.append("loadavg: (unavailable)") + } + lines.append("top_cpu_processes:") + lines.append(contentsOf: topCPUProcessLines(limit: 5)) + return lines + } + + /// Top CPU-consuming processes via `ps`. Skipped under tests — spawning process-listing + /// tools in the test environment can deadlock (see KeyPathTestCase). + nonisolated static func topCPUProcessLines(limit: Int) -> [String] { + guard !TestEnvironment.isRunningTests else { + return [" (skipped in tests)"] + } + + let process = Process() + process.executableURL = URL(fileURLWithPath: "/bin/ps") + process.arguments = ["-Aceo", "pcpu,pid,comm", "-r"] + let pipe = Pipe() + process.standardOutput = pipe + process.standardError = FileHandle.nullDevice + + do { + try process.run() + let data = pipe.fileHandleForReading.readDataToEndOfFile() + process.waitUntilExit() + guard let output = String(data: data, encoding: .utf8) else { + return [" (ps output unreadable)"] + } + let rows = output.components(separatedBy: .newlines) + .dropFirst() // header + .map { $0.trimmingCharacters(in: .whitespaces) } + .filter { !$0.isEmpty } + return rows.prefix(limit).map { " \($0)" } + } catch { + return [" (ps failed: \(error.localizedDescription))"] + } + } } private struct DiagnosticSnapshotData: Sendable { diff --git a/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift b/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift index 3979084fc..71a462633 100644 --- a/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift +++ b/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift @@ -134,6 +134,55 @@ final class StuckKeyRecoveryServiceTests: KeyPathTestCase { XCTAssertEqual(restartCallCount, 1, "Should not fire second restart while first is in progress") } + // MARK: - Capture Helpers + + func testLogTailFiltersSpamAndLimitsLines() throws { + let fileURL = FileManager.default.temporaryDirectory + .appendingPathComponent("stuck-key-logtail-\(UUID().uuidString).log") + defer { try? FileManager.default.removeItem(at: fileURL) } + + // Interleave useful lines with the heartbeat spam the filter must drop. The + // "false" variant is diagnostic (driver disconnect) and must survive the filter. + var content: [String] = [] + for index in 0 ..< 300 { + content.append("virtual_hid_keyboard_ready true") + content.append("useful line \(index)") + } + content.append("virtual_hid_keyboard_ready false") + content.append("dropping KEY_T Release") + try content.joined(separator: "\n").write(to: fileURL, atomically: true, encoding: .utf8) + + let tail = StuckKeyRecoveryService.logTail( + path: fileURL.path, + maxLines: 200, + excludingLinesContaining: "virtual_hid_keyboard_ready true" + ) + + XCTAssertEqual(tail.count, 200) + XCTAssertFalse(tail.contains("virtual_hid_keyboard_ready true"), "Heartbeat spam should be filtered") + XCTAssertEqual(tail.last, "dropping KEY_T Release", "Most recent lines should be kept") + XCTAssertTrue(tail.contains("virtual_hid_keyboard_ready false"), "Driver-disconnect lines must survive the filter") + } + + func testLogTailHandlesMissingFile() { + let tail = StuckKeyRecoveryService.logTail(path: "/nonexistent/keypath-test.log", maxLines: 50) + XCTAssertEqual(tail, ["(file not readable)"]) + } + + func testSystemLoadLinesSkipProcessListingInTests() { + let lines = StuckKeyRecoveryService.systemLoadLines() + + XCTAssertTrue( + lines.first?.hasPrefix("loadavg: ") == true, + "Load average should always be captured (libc call, safe in tests)" + ) + XCTAssertTrue(lines.contains("top_cpu_processes:")) + XCTAssertTrue( + lines.contains(" (skipped in tests)"), + "Process listing must not shell out under tests" + ) + } + // MARK: - Helpers private func makeCorrelation( diff --git a/docs/bugs/MAL-57-duplicate-keypresses.md b/docs/bugs/MAL-57-duplicate-keypresses.md index af369bff5..0df77a5db 100644 --- a/docs/bugs/MAL-57-duplicate-keypresses.md +++ b/docs/bugs/MAL-57-duplicate-keypresses.md @@ -683,6 +683,33 @@ For the next repro runs, inspect: --- +## 2026-06-10 Incident Evidence + +Two stuck-key/duplicate-keypress incidents occurred on 2026-06-10. Neither produced a +capture file in `~/Library/Logs/KeyPath/stuck-key-incidents/`, and post-hoc analysis had +to be reconstructed from `/var/log/com.keypath.kanata.stdout.log`. Three gaps in the +incident capture (`StuckKeyRecoveryService.writeSnapshotToDisk`) explain why: + +1. **Wrong log captured.** The incident file snapshotted the kanata *stderr* tail, but the + diagnostic evidence lives in *stdout*: pqrs driver/client connection events such as + `connected`, `driver connected: false`, `output backend unavailable during write`, and + `dropping KEY_* Release`. Fixed: captures now include the last 200 lines of stdout with + the high-volume `virtual_hid_keyboard_ready true` heartbeat filtered out (the `false` + variant is kept — it indicates driver disconnects). + +2. **No system-load context.** The leading hypothesis for these incidents is + CPU-starvation-induced heartbeat misses, but captures recorded nothing about load. + Fixed: each capture now includes a `=== System Load ===` section with `loadavg` and the + top 5 CPU-consuming processes at capture time. + +3. **Detector only runs while the GUI is open.** `StuckKeyRecoveryService` lives in + KeyPath.app; both 2026-06-10 incidents were missed entirely because the GUI was closed + at the time — no detection, no recovery, no capture. This remains an open gap: closing + it requires moving (or duplicating) detection into a component that runs whenever the + kanata daemon does (e.g. the LaunchDaemon side or a lightweight agent). + +--- + ## References - `Sources/KeyPathAppKit/Services/KanataTCPClient.swift` - Lines 892-983 From f0552a1161d8dd0344ed7da26eab2573cb3b5e5c Mon Sep 17 00:00:00 2001 From: Micah Alpern Date: Thu, 11 Jun 2026 11:51:03 -0700 Subject: [PATCH 2/3] Review fixes: shared constants, SubprocessRunner for ps, UTF-8-safe log tail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Use KeyPathConstants.Logs.kanataStdout/kanataStderr instead of re-hardcoding the daemon log paths. - Run ps via SubprocessRunner with a 5s timeout instead of raw Process — bounds how long a slow ps (plausible under the CPU starvation being diagnosed) can stall the snapshot that recovery awaits before restarting kanata. - logTail: decode lossily and drop the partial first line of a truncated read; previously a 1MB seek landing mid-multibyte-character made String(data:encoding:) fail and the whole tail degraded to "(file not readable)". Regression test added. Co-Authored-By: Claude Fable 5 --- .../Monitoring/StuckKeyRecoveryService.swift | 61 +++++++++---------- .../StuckKeyRecoveryServiceTests.swift | 24 +++++++- 2 files changed, 52 insertions(+), 33 deletions(-) diff --git a/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift b/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift index 35a1867d6..b8abf63cc 100644 --- a/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift +++ b/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift @@ -95,11 +95,11 @@ final class StuckKeyRecoveryService { ) await Task.detached(priority: .utility) { - Self.writeSnapshotToDisk(snapshot) + await Self.writeSnapshotToDisk(snapshot) }.value } - private nonisolated static func writeSnapshotToDisk(_ snapshot: DiagnosticSnapshotData) { + private nonisolated static func writeSnapshotToDisk(_ snapshot: DiagnosticSnapshotData) async { let diagnosticsDir = FileManager.default.homeDirectoryForCurrentUser .appendingPathComponent("Library/Logs/KeyPath/stuck-key-incidents") @@ -133,11 +133,11 @@ final class StuckKeyRecoveryService { lines.append("") lines.append("=== System Load ===") - lines.append(contentsOf: systemLoadLines()) + await lines.append(contentsOf: systemLoadLines()) lines.append("") lines.append("=== Kanata Stderr (last 50 lines) ===") - lines.append(contentsOf: logTail(path: "/var/log/com.keypath.kanata.stderr.log", maxLines: 50)) + lines.append(contentsOf: logTail(path: KeyPathConstants.Logs.kanataStderr, maxLines: 50)) lines.append("") // The pqrs driver-connection evidence (connected / driver connected / output backend @@ -146,7 +146,7 @@ final class StuckKeyRecoveryService { // kept because they indicate driver disconnects. lines.append("=== Kanata Stdout (last 200 lines, vhid-ready spam filtered) ===") lines.append(contentsOf: logTail( - path: "/var/log/com.keypath.kanata.stdout.log", + path: KeyPathConstants.Logs.kanataStdout, maxLines: 200, excludingLinesContaining: "virtual_hid_keyboard_ready true" )) @@ -194,23 +194,28 @@ final class StuckKeyRecoveryService { maxLines: Int, excludingLinesContaining filter: String? = nil ) -> [String] { + let unreadable = ["(file not readable)"] guard let handle = FileHandle(forReadingAtPath: path) else { - return ["(file not readable)"] + return unreadable } defer { try? handle.close() } let maxBytes: UInt64 = 1024 * 1024 guard let size = try? handle.seekToEnd() else { - return ["(file not readable)"] + return unreadable } - try? handle.seek(toOffset: size > maxBytes ? size - maxBytes : 0) - guard let data = try? handle.readToEnd(), - let content = String(data: data, encoding: .utf8) - else { - return ["(file not readable)"] + let offset = size > maxBytes ? size - maxBytes : 0 + try? handle.seek(toOffset: offset) + guard let data = try? handle.readToEnd() else { + return unreadable } - var tail = content.components(separatedBy: .newlines) + // Lossy decode: a truncated read can start mid-multibyte-character, which would + // make String(data:encoding:) fail for the whole chunk. + var tail = String(decoding: data, as: UTF8.self).components(separatedBy: .newlines) + if offset > 0 { + tail = Array(tail.dropFirst()) // first line of a truncated read is partial + } if let filter { tail = tail.filter { !$0.contains(filter) } } @@ -219,7 +224,7 @@ final class StuckKeyRecoveryService { /// Load average plus the top CPU consumers — the leading MAL-57 hypothesis is /// CPU-starvation-induced heartbeat misses, so each capture needs load context. - nonisolated static func systemLoadLines() -> [String] { + nonisolated static func systemLoadLines() async -> [String] { var lines: [String] = [] var loads = [Double](repeating: 0, count: 3) if getloadavg(&loads, 3) == 3 { @@ -228,32 +233,26 @@ final class StuckKeyRecoveryService { lines.append("loadavg: (unavailable)") } lines.append("top_cpu_processes:") - lines.append(contentsOf: topCPUProcessLines(limit: 5)) + await lines.append(contentsOf: topCPUProcessLines(limit: 5)) return lines } /// Top CPU-consuming processes via `ps`. Skipped under tests — spawning process-listing - /// tools in the test environment can deadlock (see KeyPathTestCase). - nonisolated static func topCPUProcessLines(limit: Int) -> [String] { + /// tools in the test environment can deadlock (see KeyPathTestCase). The timeout keeps a + /// slow `ps` (plausible under the very CPU starvation being diagnosed) from stalling the + /// snapshot that recovery awaits before restarting kanata. + nonisolated static func topCPUProcessLines(limit: Int) async -> [String] { guard !TestEnvironment.isRunningTests else { return [" (skipped in tests)"] } - let process = Process() - process.executableURL = URL(fileURLWithPath: "/bin/ps") - process.arguments = ["-Aceo", "pcpu,pid,comm", "-r"] - let pipe = Pipe() - process.standardOutput = pipe - process.standardError = FileHandle.nullDevice - do { - try process.run() - let data = pipe.fileHandleForReading.readDataToEndOfFile() - process.waitUntilExit() - guard let output = String(data: data, encoding: .utf8) else { - return [" (ps output unreadable)"] - } - let rows = output.components(separatedBy: .newlines) + let result = try await SubprocessRunner.shared.run( + "/bin/ps", + args: ["-Aceo", "pcpu,pid,comm", "-r"], + timeout: 5 + ) + let rows = result.stdout.components(separatedBy: .newlines) .dropFirst() // header .map { $0.trimmingCharacters(in: .whitespaces) } .filter { !$0.isEmpty } diff --git a/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift b/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift index 71a462633..b9605238c 100644 --- a/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift +++ b/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift @@ -164,13 +164,33 @@ final class StuckKeyRecoveryServiceTests: KeyPathTestCase { XCTAssertTrue(tail.contains("virtual_hid_keyboard_ready false"), "Driver-disconnect lines must survive the filter") } + func testLogTailTruncatedReadDropsPartialFirstLine() throws { + let fileURL = FileManager.default.temporaryDirectory + .appendingPathComponent("stuck-key-logtail-big-\(UUID().uuidString).log") + defer { try? FileManager.default.removeItem(at: fileURL) } + + // Over the 1MB tail-read cap, with multibyte characters so the seek boundary can + // land mid-character — the tail must still decode and contain only complete lines. + let padding = "héartbéat " + String(repeating: "x", count: 100) + let lineCount = 12000 + let content = (0 ..< lineCount).map { "line \($0) \(padding)" }.joined(separator: "\n") + XCTAssertGreaterThan(content.utf8.count, 1024 * 1024, "Fixture must exceed the tail-read cap") + try content.write(to: fileURL, atomically: true, encoding: .utf8) + + let tail = StuckKeyRecoveryService.logTail(path: fileURL.path, maxLines: 10) + + XCTAssertEqual(tail.count, 10) + XCTAssertTrue(tail.allSatisfy { $0.hasPrefix("line ") }, "No partial first line should survive truncation") + XCTAssertEqual(tail.last, "line \(lineCount - 1) \(padding)") + } + func testLogTailHandlesMissingFile() { let tail = StuckKeyRecoveryService.logTail(path: "/nonexistent/keypath-test.log", maxLines: 50) XCTAssertEqual(tail, ["(file not readable)"]) } - func testSystemLoadLinesSkipProcessListingInTests() { - let lines = StuckKeyRecoveryService.systemLoadLines() + func testSystemLoadLinesSkipProcessListingInTests() async { + let lines = await StuckKeyRecoveryService.systemLoadLines() XCTAssertTrue( lines.first?.hasPrefix("loadavg: ") == true, From ba0bc38dfd1d4030d72336014acd1abf2329aca0 Mon Sep 17 00:00:00 2001 From: Micah Alpern Date: Thu, 11 Jun 2026 12:11:44 -0700 Subject: [PATCH 3/3] logTail: drop empty/trailing-newline lines, document marker off-by-one Hardening from claude-review on #907: a \n-terminated log (the normal case) produced an empty-string artifact as the tail's last element; all empty lines are now dropped. The truncation marker is documented as being on top of maxLines. Co-Authored-By: Claude Fable 5 --- .../Monitoring/StuckKeyRecoveryService.swift | 17 +++++++++++------ .../Services/StuckKeyRecoveryServiceTests.swift | 4 +++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift b/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift index 818892b2c..5c18866c9 100644 --- a/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift +++ b/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift @@ -199,10 +199,12 @@ final class StuckKeyRecoveryService { // MARK: - Capture Helpers - /// Last `maxLines` lines of a log file, optionally dropping lines that contain `filter`. - /// Reads only the trailing `maxBytes` of the file — the kanata stdout log can exceed - /// 100MB; never load it whole. When the read is truncated, a marker line is prepended - /// so a capture dominated by filtered spam can't masquerade as the full history. + /// Last `maxLines` non-empty lines of a log file, optionally dropping lines that + /// contain `filter`. Reads only the trailing `maxBytes` of the file — the kanata + /// stdout log can exceed 100MB; never load it whole. When the read is truncated, a + /// marker line is prepended (on top of `maxLines`, so the result can be + /// `maxLines + 1` long) so a capture dominated by filtered spam can't masquerade + /// as the full history. nonisolated static func logTail( path: String, maxLines: Int, @@ -230,8 +232,11 @@ final class StuckKeyRecoveryService { if offset > 0 { tail = Array(tail.dropFirst()) // first line of a truncated read is partial } - if let filter { - tail = tail.filter { !$0.contains(filter) } + // Drop empties (including the artifact of a trailing newline) and filtered spam. + tail = tail.filter { line in + guard !line.isEmpty else { return false } + guard let filter else { return true } + return !line.contains(filter) } var result = Array(tail.suffix(maxLines)) if offset > 0 { diff --git a/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift b/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift index b04cabf16..2cf383904 100644 --- a/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift +++ b/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift @@ -160,8 +160,10 @@ final class StuckKeyRecoveryServiceTests: KeyPathTestCase { .appendingPathComponent("stuck-key-tail-test-\(UUID().uuidString).log") defer { try? FileManager.default.removeItem(at: url) } + // Trailing newline matches real log files; the tail must not end with an + // empty-string artifact from splitting on it. let lines = (1 ... 500).map { "line \($0)" } - try lines.joined(separator: "\n").write(to: url, atomically: true, encoding: .utf8) + try (lines.joined(separator: "\n") + "\n").write(to: url, atomically: true, encoding: .utf8) let tail = StuckKeyRecoveryService.logTail(path: url.path, maxLines: 3) XCTAssertEqual(tail, ["line 498", "line 499", "line 500"])