diff --git a/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift b/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift index fd2ee74f0..5c18866c9 100644 --- a/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift +++ b/Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift @@ -95,7 +95,7 @@ final class StuckKeyRecoveryService { ) await Task.detached(priority: .utility) { - Self.writeSnapshotToDisk(snapshot) + await Self.writeSnapshotToDisk(snapshot) }.value } @@ -107,7 +107,7 @@ final class StuckKeyRecoveryService { AppPaths.logsDirectory.appendingPathComponent("stuck-key-incidents", isDirectory: true) } - private nonisolated static func writeSnapshotToDisk(_ snapshot: DiagnosticSnapshotData) { + private nonisolated static func writeSnapshotToDisk(_ snapshot: DiagnosticSnapshotData) async { let diagnosticsDir = diagnosticsDirectory let timestamp = ISO8601DateFormatter().string(from: Date()) @@ -140,23 +140,20 @@ final class StuckKeyRecoveryService { lines.append("") lines.append("=== Kanata Stderr (last 50 lines) ===") - if let data = FileManager.default.contents(atPath: "/var/log/com.keypath.kanata.stderr.log"), - let content = String(data: data, encoding: .utf8) - { - lines.append(contentsOf: content.components(separatedBy: .newlines).suffix(50)) - } else { - lines.append("(file not readable)") - } + lines.append(contentsOf: logTail(path: KeyPathConstants.Logs.kanataStderr, maxLines: 50)) lines.append("") - // The diagnostic gold lives in stdout, not stderr: key-input traces, - // VHID connection events ("connected", "driver connected:", - // "output backend unavailable during write", "dropping KEY_X Release"). - // The 2026-06-10 MAL-57 incidents were diagnosed entirely from stdout. - lines.append("=== Kanata Stdout (last 120 lines) ===") - lines.append(contentsOf: tailOfFile( - atPath: KeyPathConstants.Logs.kanataStdout, - lineCount: 120 + // The diagnostic gold lives in stdout, not stderr: key-input traces, VHID + // connection events ("connected", "driver connected:", "output backend + // unavailable during write", "dropping KEY_X Release"). The 2026-06-10 MAL-57 + // incidents were diagnosed entirely from stdout. The high-volume + // "virtual_hid_keyboard_ready true" heartbeat is filtered out; "... false" + // lines are kept because they indicate driver disconnects. + lines.append("=== Kanata Stdout (last 200 lines, vhid-ready spam filtered) ===") + lines.append(contentsOf: logTail( + path: KeyPathConstants.Logs.kanataStdout, + maxLines: 200, + excludingLinesContaining: "virtual_hid_keyboard_ready true" )) lines.append("") @@ -164,17 +161,7 @@ final class StuckKeyRecoveryService { // behind stuck keys (MAL-57); record load so incidents can confirm or // refute the correlation without a live investigation. lines.append("=== System Load ===") - var loads = [Double](repeating: 0, count: 3) - if getloadavg(&loads, 3) == 3 { - lines.append("loadavg_1m_5m_15m: \(loads[0]) \(loads[1]) \(loads[2])") - } else { - lines.append("loadavg_1m_5m_15m: (unavailable)") - } - lines.append("active_cpu_count: \(ProcessInfo.processInfo.activeProcessorCount)") - if !TestEnvironment.isRunningTests { - lines.append("top_cpu_processes:") - lines.append(contentsOf: topCPUProcesses()) - } + await lines.append(contentsOf: systemLoadLines()) lines.append("") lines.append("=== Recovery History ===") @@ -210,53 +197,94 @@ final class StuckKeyRecoveryService { } } - /// Bounded tail of a possibly-huge log file. The kanata stdout log can - /// exceed 100MB; never load it whole. Internal for testability. - nonisolated static func tailOfFile( - atPath path: String, - lineCount: Int, - maxBytes: Int = 64 * 1024 + // MARK: - Capture Helpers + + /// Last `maxLines` non-empty lines of a log file, optionally dropping lines that + /// contain `filter`. Reads only the trailing `maxBytes` of the file — the kanata + /// stdout log can exceed 100MB; never load it whole. When the read is truncated, a + /// marker line is prepended (on top of `maxLines`, so the result can be + /// `maxLines + 1` long) so a capture dominated by filtered spam can't masquerade + /// as the full history. + nonisolated static func logTail( + path: String, + maxLines: Int, + maxBytes: UInt64 = 1024 * 1024, + excludingLinesContaining filter: String? = nil ) -> [String] { - guard let fileHandle = FileHandle(forReadingAtPath: path) else { - return ["(file not readable)"] + let unreadable = ["(file not readable)"] + guard let handle = FileHandle(forReadingAtPath: path) else { + return unreadable } - defer { try? fileHandle.close() } + defer { try? handle.close() } - let fileSize: UInt64 = (try? fileHandle.seekToEnd()) ?? 0 - let offset = fileSize > UInt64(maxBytes) ? fileSize - UInt64(maxBytes) : 0 - try? fileHandle.seek(toOffset: offset) - guard let data = try? fileHandle.readToEnd(), - let content = String(data: data, encoding: .utf8) - else { - return ["(file not readable)"] + guard let size = try? handle.seekToEnd() else { + return unreadable + } + let offset = size > maxBytes ? size - maxBytes : 0 + try? handle.seek(toOffset: offset) + guard let data = try? handle.readToEnd() else { + return unreadable + } + + // Lossy decode: a truncated read can start mid-multibyte-character, which would + // make String(data:encoding:) fail for the whole chunk. + var tail = String(decoding: data, as: UTF8.self).components(separatedBy: .newlines) + if offset > 0 { + tail = Array(tail.dropFirst()) // first line of a truncated read is partial } - return Array(content.components(separatedBy: .newlines).suffix(lineCount)) + // Drop empties (including the artifact of a trailing newline) and filtered spam. + tail = tail.filter { line in + guard !line.isEmpty else { return false } + guard let filter else { return true } + return !line.contains(filter) + } + var result = Array(tail.suffix(maxLines)) + if offset > 0 { + result.insert("(… older output truncated: tail window is the last \(maxBytes / 1024)KB of the file)", at: 0) + } + return result + } + + /// Load average, core count, and the top CPU consumers — the leading MAL-57 + /// hypothesis is CPU-starvation-induced heartbeat misses, so each capture + /// needs load context. + nonisolated static func systemLoadLines() async -> [String] { + var lines: [String] = [] + var loads = [Double](repeating: 0, count: 3) + if getloadavg(&loads, 3) == 3 { + lines.append(String(format: "loadavg_1m_5m_15m: %.2f %.2f %.2f", loads[0], loads[1], loads[2])) + } else { + lines.append("loadavg_1m_5m_15m: (unavailable)") + } + lines.append("active_cpu_count: \(ProcessInfo.processInfo.activeProcessorCount)") + lines.append("top_cpu_processes:") + await lines.append(contentsOf: topCPUProcessLines(limit: 5)) + return lines } - /// Top CPU consumers at capture time. Spawns `ps`; callers must not invoke - /// this in tests (subprocess spawning is forbidden there — see KeyPathTestCase). - private nonisolated static func topCPUProcesses(count: Int = 5) -> [String] { - let process = Process() - process.executableURL = URL(fileURLWithPath: "/bin/ps") - process.arguments = ["-Aceo", "pcpu,comm", "-r"] - let pipe = Pipe() - process.standardOutput = pipe - process.standardError = Pipe() + /// Top CPU-consuming processes via `ps`. Skipped under tests — spawning process-listing + /// tools in the test environment can deadlock (see KeyPathTestCase). The timeout keeps a + /// slow `ps` (plausible under the very CPU starvation being diagnosed) from stalling the + /// snapshot that recovery awaits before restarting kanata. + nonisolated static func topCPUProcessLines(limit: Int) async -> [String] { + guard !TestEnvironment.isRunningTests else { + return [" (skipped in tests)"] + } + do { - try process.run() + let result = try await SubprocessRunner.shared.run( + "/bin/ps", + args: ["-Aceo", "pcpu,pid,comm", "-r"], + timeout: 5 + ) + let rows = result.stdout.components(separatedBy: .newlines) + .dropFirst() // header + .map { $0.trimmingCharacters(in: .whitespaces) } + .filter { !$0.isEmpty } + return rows.prefix(limit).map { " \($0)" } } catch { - return [" (ps failed to launch: \(error.localizedDescription))"] - } - let data = pipe.fileHandleForReading.readDataToEndOfFile() - process.waitUntilExit() - guard let output = String(data: data, encoding: .utf8) else { - return [" (ps output unreadable)"] + return [" (ps failed: \(error.localizedDescription))"] } - return output - .components(separatedBy: .newlines) - .dropFirst() // header - .prefix(count) - .map { " \($0.trimmingCharacters(in: .whitespaces))" } } } diff --git a/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift b/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift index 0816a9f85..2cf383904 100644 --- a/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift +++ b/Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift @@ -153,27 +153,102 @@ final class StuckKeyRecoveryServiceTests: KeyPathTestCase { ) } - // MARK: - Incident Capture Helpers + // MARK: - Capture Helpers - func testTailOfFileReturnsLastLinesWithoutLoadingWholeFile() throws { + func testLogTailReturnsLastLinesWithoutLoadingWholeFile() throws { let url = FileManager.default.temporaryDirectory .appendingPathComponent("stuck-key-tail-test-\(UUID().uuidString).log") defer { try? FileManager.default.removeItem(at: url) } + // Trailing newline matches real log files; the tail must not end with an + // empty-string artifact from splitting on it. let lines = (1 ... 500).map { "line \($0)" } - try lines.joined(separator: "\n").write(to: url, atomically: true, encoding: .utf8) + try (lines.joined(separator: "\n") + "\n").write(to: url, atomically: true, encoding: .utf8) - let tail = StuckKeyRecoveryService.tailOfFile(atPath: url.path, lineCount: 3) + let tail = StuckKeyRecoveryService.logTail(path: url.path, maxLines: 3) XCTAssertEqual(tail, ["line 498", "line 499", "line 500"]) // Bounded read: a tiny maxBytes window must still return the newest // lines, proving we read from the end rather than the start. - let bounded = StuckKeyRecoveryService.tailOfFile(atPath: url.path, lineCount: 2, maxBytes: 64) + let bounded = StuckKeyRecoveryService.logTail(path: url.path, maxLines: 2, maxBytes: 64) XCTAssertEqual(bounded.suffix(1), ["line 500"]) + XCTAssertTrue( + bounded.first?.contains("truncated") == true, + "A truncated tail window must announce itself" + ) + } + + func testLogTailFiltersSpamAndLimitsLines() throws { + let fileURL = FileManager.default.temporaryDirectory + .appendingPathComponent("stuck-key-logtail-\(UUID().uuidString).log") + defer { try? FileManager.default.removeItem(at: fileURL) } + + // Interleave useful lines with the heartbeat spam the filter must drop. The + // "false" variant is diagnostic (driver disconnect) and must survive the filter. + var content: [String] = [] + for index in 0 ..< 300 { + content.append("virtual_hid_keyboard_ready true") + content.append("useful line \(index)") + } + content.append("virtual_hid_keyboard_ready false") + content.append("dropping KEY_T Release") + try content.joined(separator: "\n").write(to: fileURL, atomically: true, encoding: .utf8) + + let tail = StuckKeyRecoveryService.logTail( + path: fileURL.path, + maxLines: 200, + excludingLinesContaining: "virtual_hid_keyboard_ready true" + ) - XCTAssertEqual( - StuckKeyRecoveryService.tailOfFile(atPath: "/nonexistent/path.log", lineCount: 3), - ["(file not readable)"] + XCTAssertEqual(tail.count, 200) + XCTAssertFalse(tail.contains("virtual_hid_keyboard_ready true"), "Heartbeat spam should be filtered") + XCTAssertEqual(tail.last, "dropping KEY_T Release", "Most recent lines should be kept") + XCTAssertTrue(tail.contains("virtual_hid_keyboard_ready false"), "Driver-disconnect lines must survive the filter") + } + + func testLogTailTruncatedReadDropsPartialFirstLine() throws { + let fileURL = FileManager.default.temporaryDirectory + .appendingPathComponent("stuck-key-logtail-big-\(UUID().uuidString).log") + defer { try? FileManager.default.removeItem(at: fileURL) } + + // Over the 1MB tail-read cap, with multibyte characters so the seek boundary can + // land mid-character — the tail must still decode and contain only complete lines. + let padding = "héartbéat " + String(repeating: "x", count: 100) + let lineCount = 12000 + let content = (0 ..< lineCount).map { "line \($0) \(padding)" }.joined(separator: "\n") + XCTAssertGreaterThan(content.utf8.count, 1024 * 1024, "Fixture must exceed the tail-read cap") + try content.write(to: fileURL, atomically: true, encoding: .utf8) + + let tail = StuckKeyRecoveryService.logTail(path: fileURL.path, maxLines: 10) + + XCTAssertEqual(tail.count, 11, "10 content lines plus the truncation marker") + XCTAssertTrue( + tail.first?.contains("truncated") == true, + "A truncated tail window must announce itself" + ) + XCTAssertTrue( + tail.dropFirst().allSatisfy { $0.hasPrefix("line ") }, + "No partial first line should survive truncation" + ) + XCTAssertEqual(tail.last, "line \(lineCount - 1) \(padding)") + } + + func testLogTailHandlesMissingFile() { + let tail = StuckKeyRecoveryService.logTail(path: "/nonexistent/keypath-test.log", maxLines: 50) + XCTAssertEqual(tail, ["(file not readable)"]) + } + + func testSystemLoadLinesSkipProcessListingInTests() async { + let lines = await StuckKeyRecoveryService.systemLoadLines() + + XCTAssertTrue( + lines.first?.hasPrefix("loadavg_1m_5m_15m: ") == true, + "Load average should always be captured (libc call, safe in tests)" + ) + XCTAssertTrue(lines.contains("top_cpu_processes:")) + XCTAssertTrue( + lines.contains(" (skipped in tests)"), + "Process listing must not shell out under tests" ) } diff --git a/docs/bugs/MAL-57-duplicate-keypresses.md b/docs/bugs/MAL-57-duplicate-keypresses.md index 0be3c1231..c3f33b18c 100644 --- a/docs/bugs/MAL-57-duplicate-keypresses.md +++ b/docs/bugs/MAL-57-duplicate-keypresses.md @@ -137,9 +137,14 @@ process key reports. Telemetry shipped alongside: all VHID client connection lines are timestamped (`vhid-client:` prefix), and the previously silent heartbeat-deadline reconnect path logs a stamped reason. Incident capture - (`StuckKeyRecoveryService`, #897) now snapshots the kanata *stdout* tail - (where the diagnostic evidence lives), loadavg/core count, and top CPU - consumers; the detector's GUI-only limitation is documented in the class. + (`StuckKeyRecoveryService`, #897/#907) now snapshots the kanata *stdout* tail + (where the diagnostic evidence lives) with the `virtual_hid_keyboard_ready + true` heartbeat spam filtered out (`... false` lines are kept — they mark + driver disconnects) and an explicit marker when the 1MB tail window + truncates older output; plus loadavg/core count and top CPU consumers + (via `SubprocessRunner` with a timeout, so a starved `ps` cannot delay the + recovery restart). The detector's GUI-only limitation is documented in the + class. Residual gap no fix can fully close: a release written into a socket that is dying but not yet declared dead is unrecoverable in transit; Layer 2 bounds the