Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@
)

await Task.detached(priority: .utility) {
Self.writeSnapshotToDisk(snapshot)
await Self.writeSnapshotToDisk(snapshot)
}.value
}

Expand All @@ -107,7 +107,7 @@
AppPaths.logsDirectory.appendingPathComponent("stuck-key-incidents", isDirectory: true)
}

private nonisolated static func writeSnapshotToDisk(_ snapshot: DiagnosticSnapshotData) {
private nonisolated static func writeSnapshotToDisk(_ snapshot: DiagnosticSnapshotData) async {
let diagnosticsDir = diagnosticsDirectory

let timestamp = ISO8601DateFormatter().string(from: Date())
Expand Down Expand Up @@ -140,41 +140,28 @@
lines.append("")

lines.append("=== Kanata Stderr (last 50 lines) ===")
if let data = FileManager.default.contents(atPath: "/var/log/com.keypath.kanata.stderr.log"),
let content = String(data: data, encoding: .utf8)
{
lines.append(contentsOf: content.components(separatedBy: .newlines).suffix(50))
} else {
lines.append("(file not readable)")
}
lines.append(contentsOf: logTail(path: KeyPathConstants.Logs.kanataStderr, maxLines: 50))
lines.append("")

// The diagnostic gold lives in stdout, not stderr: key-input traces,
// VHID connection events ("connected", "driver connected:",
// "output backend unavailable during write", "dropping KEY_X Release").
// The 2026-06-10 MAL-57 incidents were diagnosed entirely from stdout.
lines.append("=== Kanata Stdout (last 120 lines) ===")
lines.append(contentsOf: tailOfFile(
atPath: KeyPathConstants.Logs.kanataStdout,
lineCount: 120
// The diagnostic gold lives in stdout, not stderr: key-input traces, VHID
// connection events ("connected", "driver connected:", "output backend
// unavailable during write", "dropping KEY_X Release"). The 2026-06-10 MAL-57
// incidents were diagnosed entirely from stdout. The high-volume
// "virtual_hid_keyboard_ready true" heartbeat is filtered out; "... false"
// lines are kept because they indicate driver disconnects.
lines.append("=== Kanata Stdout (last 200 lines, vhid-ready spam filtered) ===")
lines.append(contentsOf: logTail(
path: KeyPathConstants.Logs.kanataStdout,
maxLines: 200,
excludingLinesContaining: "virtual_hid_keyboard_ready true"
))
lines.append("")

// CPU starvation is the leading trigger for the VHID disconnects
// behind stuck keys (MAL-57); record load so incidents can confirm or
// refute the correlation without a live investigation.
lines.append("=== System Load ===")
var loads = [Double](repeating: 0, count: 3)
if getloadavg(&loads, 3) == 3 {
lines.append("loadavg_1m_5m_15m: \(loads[0]) \(loads[1]) \(loads[2])")
} else {
lines.append("loadavg_1m_5m_15m: (unavailable)")
}
lines.append("active_cpu_count: \(ProcessInfo.processInfo.activeProcessorCount)")
if !TestEnvironment.isRunningTests {
lines.append("top_cpu_processes:")
lines.append(contentsOf: topCPUProcesses())
}
await lines.append(contentsOf: systemLoadLines())
lines.append("")

lines.append("=== Recovery History ===")
Expand Down Expand Up @@ -210,53 +197,94 @@
}
}

/// Bounded tail of a possibly-huge log file. The kanata stdout log can
/// exceed 100MB; never load it whole. Internal for testability.
nonisolated static func tailOfFile(
atPath path: String,
lineCount: Int,
maxBytes: Int = 64 * 1024
// MARK: - Capture Helpers

/// Last `maxLines` non-empty lines of a log file, optionally dropping lines that
/// contain `filter`. Reads only the trailing `maxBytes` of the file — the kanata
/// stdout log can exceed 100MB; never load it whole. When the read is truncated, a
/// marker line is prepended (on top of `maxLines`, so the result can be
/// `maxLines + 1` long) so a capture dominated by filtered spam can't masquerade
/// as the full history.
nonisolated static func logTail(
path: String,
maxLines: Int,
maxBytes: UInt64 = 1024 * 1024,
excludingLinesContaining filter: String? = nil
) -> [String] {
guard let fileHandle = FileHandle(forReadingAtPath: path) else {
return ["(file not readable)"]
let unreadable = ["(file not readable)"]
guard let handle = FileHandle(forReadingAtPath: path) else {
return unreadable
}
defer { try? fileHandle.close() }
defer { try? handle.close() }

let fileSize: UInt64 = (try? fileHandle.seekToEnd()) ?? 0
let offset = fileSize > UInt64(maxBytes) ? fileSize - UInt64(maxBytes) : 0
try? fileHandle.seek(toOffset: offset)
guard let data = try? fileHandle.readToEnd(),
let content = String(data: data, encoding: .utf8)
else {
return ["(file not readable)"]
guard let size = try? handle.seekToEnd() else {
return unreadable
}
let offset = size > maxBytes ? size - maxBytes : 0
try? handle.seek(toOffset: offset)
guard let data = try? handle.readToEnd() else {
return unreadable
}

// Lossy decode: a truncated read can start mid-multibyte-character, which would
// make String(data:encoding:) fail for the whole chunk.
var tail = String(decoding: data, as: UTF8.self).components(separatedBy: .newlines)

Check warning on line 231 in Sources/KeyPathAppKit/Services/Monitoring/StuckKeyRecoveryService.swift

View workflow job for this annotation

GitHub Actions / code-quality

Prefer failable `String(bytes:encoding:)` initializer when converting `Data` to `String` (optional_data_string_conversion)
if offset > 0 {
tail = Array(tail.dropFirst()) // first line of a truncated read is partial
}
return Array(content.components(separatedBy: .newlines).suffix(lineCount))
// Drop empties (including the artifact of a trailing newline) and filtered spam.
tail = tail.filter { line in
guard !line.isEmpty else { return false }
guard let filter else { return true }
return !line.contains(filter)
}
var result = Array(tail.suffix(maxLines))
if offset > 0 {
result.insert("(… older output truncated: tail window is the last \(maxBytes / 1024)KB of the file)", at: 0)
}
return result
}

/// Load average, core count, and the top CPU consumers — the leading MAL-57
/// hypothesis is CPU-starvation-induced heartbeat misses, so each capture
/// needs load context.
nonisolated static func systemLoadLines() async -> [String] {
var lines: [String] = []
var loads = [Double](repeating: 0, count: 3)
if getloadavg(&loads, 3) == 3 {
lines.append(String(format: "loadavg_1m_5m_15m: %.2f %.2f %.2f", loads[0], loads[1], loads[2]))
} else {
lines.append("loadavg_1m_5m_15m: (unavailable)")
}
lines.append("active_cpu_count: \(ProcessInfo.processInfo.activeProcessorCount)")
lines.append("top_cpu_processes:")
await lines.append(contentsOf: topCPUProcessLines(limit: 5))
return lines
}

/// Top CPU consumers at capture time. Spawns `ps`; callers must not invoke
/// this in tests (subprocess spawning is forbidden there — see KeyPathTestCase).
private nonisolated static func topCPUProcesses(count: Int = 5) -> [String] {
let process = Process()
process.executableURL = URL(fileURLWithPath: "/bin/ps")
process.arguments = ["-Aceo", "pcpu,comm", "-r"]
let pipe = Pipe()
process.standardOutput = pipe
process.standardError = Pipe()
/// Top CPU-consuming processes via `ps`. Skipped under tests — spawning process-listing
/// tools in the test environment can deadlock (see KeyPathTestCase). The timeout keeps a
/// slow `ps` (plausible under the very CPU starvation being diagnosed) from stalling the
/// snapshot that recovery awaits before restarting kanata.
nonisolated static func topCPUProcessLines(limit: Int) async -> [String] {
guard !TestEnvironment.isRunningTests else {
return [" (skipped in tests)"]
}

do {
try process.run()
let result = try await SubprocessRunner.shared.run(
"/bin/ps",
args: ["-Aceo", "pcpu,pid,comm", "-r"],
timeout: 5
)
let rows = result.stdout.components(separatedBy: .newlines)
.dropFirst() // header
.map { $0.trimmingCharacters(in: .whitespaces) }
.filter { !$0.isEmpty }
return rows.prefix(limit).map { " \($0)" }
} catch {
return [" (ps failed to launch: \(error.localizedDescription))"]
}
let data = pipe.fileHandleForReading.readDataToEndOfFile()
process.waitUntilExit()
guard let output = String(data: data, encoding: .utf8) else {
return [" (ps output unreadable)"]
return [" (ps failed: \(error.localizedDescription))"]
}
return output
.components(separatedBy: .newlines)
.dropFirst() // header
.prefix(count)
.map { " \($0.trimmingCharacters(in: .whitespaces))" }
}
}

Expand Down
91 changes: 83 additions & 8 deletions Tests/KeyPathTests/Services/StuckKeyRecoveryServiceTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -153,27 +153,102 @@ final class StuckKeyRecoveryServiceTests: KeyPathTestCase {
)
}

// MARK: - Incident Capture Helpers
// MARK: - Capture Helpers

func testTailOfFileReturnsLastLinesWithoutLoadingWholeFile() throws {
func testLogTailReturnsLastLinesWithoutLoadingWholeFile() throws {
let url = FileManager.default.temporaryDirectory
.appendingPathComponent("stuck-key-tail-test-\(UUID().uuidString).log")
defer { try? FileManager.default.removeItem(at: url) }

// Trailing newline matches real log files; the tail must not end with an
// empty-string artifact from splitting on it.
let lines = (1 ... 500).map { "line \($0)" }
try lines.joined(separator: "\n").write(to: url, atomically: true, encoding: .utf8)
try (lines.joined(separator: "\n") + "\n").write(to: url, atomically: true, encoding: .utf8)

let tail = StuckKeyRecoveryService.tailOfFile(atPath: url.path, lineCount: 3)
let tail = StuckKeyRecoveryService.logTail(path: url.path, maxLines: 3)
XCTAssertEqual(tail, ["line 498", "line 499", "line 500"])

// Bounded read: a tiny maxBytes window must still return the newest
// lines, proving we read from the end rather than the start.
let bounded = StuckKeyRecoveryService.tailOfFile(atPath: url.path, lineCount: 2, maxBytes: 64)
let bounded = StuckKeyRecoveryService.logTail(path: url.path, maxLines: 2, maxBytes: 64)
XCTAssertEqual(bounded.suffix(1), ["line 500"])
XCTAssertTrue(
bounded.first?.contains("truncated") == true,
"A truncated tail window must announce itself"
)
}

func testLogTailFiltersSpamAndLimitsLines() throws {
let fileURL = FileManager.default.temporaryDirectory
.appendingPathComponent("stuck-key-logtail-\(UUID().uuidString).log")
defer { try? FileManager.default.removeItem(at: fileURL) }

// Interleave useful lines with the heartbeat spam the filter must drop. The
// "false" variant is diagnostic (driver disconnect) and must survive the filter.
var content: [String] = []
for index in 0 ..< 300 {
content.append("virtual_hid_keyboard_ready true")
content.append("useful line \(index)")
}
content.append("virtual_hid_keyboard_ready false")
content.append("dropping KEY_T Release")
try content.joined(separator: "\n").write(to: fileURL, atomically: true, encoding: .utf8)

let tail = StuckKeyRecoveryService.logTail(
path: fileURL.path,
maxLines: 200,
excludingLinesContaining: "virtual_hid_keyboard_ready true"
)

XCTAssertEqual(
StuckKeyRecoveryService.tailOfFile(atPath: "/nonexistent/path.log", lineCount: 3),
["(file not readable)"]
XCTAssertEqual(tail.count, 200)
XCTAssertFalse(tail.contains("virtual_hid_keyboard_ready true"), "Heartbeat spam should be filtered")
XCTAssertEqual(tail.last, "dropping KEY_T Release", "Most recent lines should be kept")
XCTAssertTrue(tail.contains("virtual_hid_keyboard_ready false"), "Driver-disconnect lines must survive the filter")
}

func testLogTailTruncatedReadDropsPartialFirstLine() throws {
let fileURL = FileManager.default.temporaryDirectory
.appendingPathComponent("stuck-key-logtail-big-\(UUID().uuidString).log")
defer { try? FileManager.default.removeItem(at: fileURL) }

// Over the 1MB tail-read cap, with multibyte characters so the seek boundary can
// land mid-character — the tail must still decode and contain only complete lines.
let padding = "héartbéat " + String(repeating: "x", count: 100)
let lineCount = 12000
let content = (0 ..< lineCount).map { "line \($0) \(padding)" }.joined(separator: "\n")
XCTAssertGreaterThan(content.utf8.count, 1024 * 1024, "Fixture must exceed the tail-read cap")
try content.write(to: fileURL, atomically: true, encoding: .utf8)

let tail = StuckKeyRecoveryService.logTail(path: fileURL.path, maxLines: 10)

XCTAssertEqual(tail.count, 11, "10 content lines plus the truncation marker")
XCTAssertTrue(
tail.first?.contains("truncated") == true,
"A truncated tail window must announce itself"
)
XCTAssertTrue(
tail.dropFirst().allSatisfy { $0.hasPrefix("line ") },
"No partial first line should survive truncation"
)
XCTAssertEqual(tail.last, "line \(lineCount - 1) \(padding)")
}

func testLogTailHandlesMissingFile() {
let tail = StuckKeyRecoveryService.logTail(path: "/nonexistent/keypath-test.log", maxLines: 50)
XCTAssertEqual(tail, ["(file not readable)"])
}

func testSystemLoadLinesSkipProcessListingInTests() async {
let lines = await StuckKeyRecoveryService.systemLoadLines()

XCTAssertTrue(
lines.first?.hasPrefix("loadavg_1m_5m_15m: ") == true,
"Load average should always be captured (libc call, safe in tests)"
)
XCTAssertTrue(lines.contains("top_cpu_processes:"))
XCTAssertTrue(
lines.contains(" (skipped in tests)"),
"Process listing must not shell out under tests"
)
}

Expand Down
11 changes: 8 additions & 3 deletions docs/bugs/MAL-57-duplicate-keypresses.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,14 @@ process key reports.
Telemetry shipped alongside: all VHID client connection lines are
timestamped (`vhid-client:` prefix), and the previously silent
heartbeat-deadline reconnect path logs a stamped reason. Incident capture
(`StuckKeyRecoveryService`, #897) now snapshots the kanata *stdout* tail
(where the diagnostic evidence lives), loadavg/core count, and top CPU
consumers; the detector's GUI-only limitation is documented in the class.
(`StuckKeyRecoveryService`, #897/#907) now snapshots the kanata *stdout* tail
(where the diagnostic evidence lives) with the `virtual_hid_keyboard_ready
true` heartbeat spam filtered out (`... false` lines are kept — they mark
driver disconnects) and an explicit marker when the 1MB tail window
truncates older output; plus loadavg/core count and top CPU consumers
(via `SubprocessRunner` with a timeout, so a starved `ps` cannot delay the
recovery restart). The detector's GUI-only limitation is documented in the
class.

Residual gap no fix can fully close: a release written into a socket that is
dying but not yet declared dead is unrecoverable in transit; Layer 2 bounds the
Expand Down
Loading