From 7dd112549f616cb3b4497067cb42e4e533bdf114 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Jun 2026 09:53:14 +0000 Subject: [PATCH 1/3] Fix TCP bridge hang when a client stalls or dies silently MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The client socket returned by accept() was blocking with no send timeout, and all bus-forwarding sends used a blocking send(). If a connected client stopped draining (network drop with no FIN/RST), the kernel send buffer filled and send() blocked the bridge task forever. That task is also the only UART reader, so the whole device froze with no panic, no logs and no recovery — matching the observed weeks-stable-then-dead crash. Make the client socket non-blocking and give send_to_client a keep/drop policy: a full send keeps the client, a full send buffer (EAGAIN) drops the message but keeps the connection, and a partial write or hard error drops the client. Add TCP keepalive (idle 30s, interval 5s, count 3) so a silently dead peer is detected and the single client slot freed. Factor the client teardown into a shared helper. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_018oMk4krXAUqhX5gWtvc1mf --- CHANGELOG.md | 1 + main/config.h | 7 +++ main/tcp_bridge.c | 114 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 98 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 974f3f3..b5c1bba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Frame parser and decoder now also accept "discovery" packets (control bytes `0x00 0x00`): a shorter 11-byte frame shape with no payload and no data-checksum byte, alongside the existing `0x80 0x00` data packets ### Removed ### Fixed +- TCP bridge no longer hangs when a connected client stalls or dies silently. The client socket is now non-blocking with a keep/drop policy on every send (full send keeps the client; a full send buffer drops the message but keeps the connection; a partial write or hard error drops the client), so a stuck peer can never wedge the bridge task — which is also the only task reading the UART, meaning such a wedge previously froze the whole device with no logs and no recovery. Added TCP keepalive (idle 30s, interval 5s, count 3) so a silently dead peer is detected and the single client slot freed. ### Deprecated ### Security diff --git a/main/config.h b/main/config.h index 5256020..77a8704 100644 --- a/main/config.h +++ b/main/config.h @@ -101,6 +101,13 @@ #define TCP_TASK_STACK_SIZE 8192 // TCP bridge task stack size #define TCP_TASK_PRIORITY 5 // TCP bridge task priority +// TCP keepalive — reap a silently dead client (one that never sends FIN/RST) +// so the single client slot is freed. Worst-case detection time is roughly +// IDLE + INTERVAL * COUNT seconds. +#define TCP_KEEPALIVE_IDLE_SEC 30 // Idle seconds before the first keepalive probe +#define TCP_KEEPALIVE_INTERVAL_SEC 5 // Seconds between probes +#define TCP_KEEPALIVE_COUNT 3 // Failed probes before dropping the connection + // ====================================================== // Buffer Sizes // ====================================================== diff --git a/main/tcp_bridge.c b/main/tcp_bridge.c index 285ee36..19a8c95 100644 --- a/main/tcp_bridge.c +++ b/main/tcp_bridge.c @@ -53,16 +53,37 @@ static SemaphoreHandle_t s_stopped_sem = NULL; * Send data to the TCP client under the client mutex. * All sends to client_sock must go through this to prevent interleaving * with the log vprintf callback, which can fire from any task at any time. + * + * The client socket is non-blocking, so a stalled or dead peer can never wedge + * the bridge task. Returns true if the client should be kept, false if it + * should be dropped: + * - full send -> keep + * - buffer full (EAGAIN) -> message dropped, keep. The hex stream is a + * best-effort debug feed; a truly dead peer is + * reaped by TCP keepalive / the recv path. + * - partial write -> drop, since the hex stream is now desynced and a + * clean reconnect is the safe recovery. + * - other send error -> drop. + * Mutex contention is never the client's fault, so it keeps the client. */ -static void send_to_client(int sock, const void *data, int len) +static bool send_to_client(int sock, const void *data, int len) { if (sock < 0 || len <= 0) { - return; + return true; } + bool keep = true; if (s_log_mutex && xSemaphoreTake(s_log_mutex, pdMS_TO_TICKS(MUTEX_TIMEOUT_MS)) == pdTRUE) { - send(sock, data, len, 0); + int sent = send(sock, data, len, MSG_DONTWAIT); + if (sent < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + keep = false; + } + } else if (sent < len) { + keep = false; + } xSemaphoreGive(s_log_mutex); } + return keep; } /** @@ -109,6 +130,21 @@ static void tcp_bridge_set_log_client(int sock) } } +/** + * Tear down the current TCP client: disable log forwarding, close the socket, + * and reset the per-client line buffer. Safe to call with no client connected. + */ +static void tcp_bridge_close_client(int *client_sock, int *line_pos) +{ + tcp_bridge_set_log_client(-1); // Disable log forwarding before closing the fd + if (*client_sock >= 0) { + shutdown(*client_sock, SHUT_RDWR); + close(*client_sock); + } + *client_sock = -1; + *line_pos = 0; +} + /** * TCP server task implementation @@ -179,18 +215,40 @@ static void tcp_bridge_task(void *pvParameters) inet_ntoa_r(client_addr.sin_addr, addr_str, sizeof(addr_str) - 1); ESP_LOGI(TAG, "Client connected from %s", addr_str); - const char *hello = - "Connected to pool control bus bridge.\r\n" - "UART bytes will be shown here in hex.\r\n" - "Decoded messages will also be shown.\r\n" - "Send hex strings (e.g., '02 00 50 FF FF 03') to transmit to the bus.\r\n\r\n"; - send_to_client(client_sock, hello, strlen(hello)); + // Make the client socket non-blocking so a stalled or dead peer + // can never wedge the bridge task — which is also the only task + // reading the UART. A full send buffer now returns immediately + // instead of blocking the task indefinitely. + int cflags = fcntl(client_sock, F_GETFL, 0); + fcntl(client_sock, F_SETFL, cflags | O_NONBLOCK); + + // Enable TCP keepalive so a silently dead peer (one that never + // sends FIN/RST) is detected and the single client slot freed, + // without relying on the peer to send us anything. + int ka_enable = 1; + int ka_idle = TCP_KEEPALIVE_IDLE_SEC; + int ka_intvl = TCP_KEEPALIVE_INTERVAL_SEC; + int ka_count = TCP_KEEPALIVE_COUNT; + setsockopt(client_sock, SOL_SOCKET, SO_KEEPALIVE, &ka_enable, sizeof(ka_enable)); + setsockopt(client_sock, IPPROTO_TCP, TCP_KEEPIDLE, &ka_idle, sizeof(ka_idle)); + setsockopt(client_sock, IPPROTO_TCP, TCP_KEEPINTVL, &ka_intvl, sizeof(ka_intvl)); + setsockopt(client_sock, IPPROTO_TCP, TCP_KEEPCNT, &ka_count, sizeof(ka_count)); // Enable log forwarding for this client tcp_bridge_set_log_client(client_sock); // Reset line buffer for new client line_pos = 0; + + const char *hello = + "Connected to pool control bus bridge.\r\n" + "UART bytes will be shown here in hex.\r\n" + "Decoded messages will also be shown.\r\n" + "Send hex strings (e.g., '02 00 50 FF FF 03') to transmit to the bus.\r\n\r\n"; + if (!send_to_client(client_sock, hello, strlen(hello))) { + ESP_LOGI(TAG, "Client dropped during greeting"); + tcp_bridge_close_client(&client_sock, &line_pos); + } } } @@ -250,7 +308,10 @@ static void tcp_bridge_task(void *pvParameters) if (client_sock >= 0) { hexLine[hex_pos] = '\r'; hexLine[hex_pos + 1] = '\n'; - send_to_client(client_sock, hexLine, hex_pos + 2); + if (!send_to_client(client_sock, hexLine, hex_pos + 2)) { + // Decoding above already ran; just drop the client. + tcp_bridge_close_client(&client_sock, &line_pos); + } } } else if (s_config.on_resync) { // Map each framing failure to its own resync category so the @@ -290,7 +351,10 @@ static void tcp_bridge_task(void *pvParameters) char c = tcp_buf[i]; // Echo the character back to the client - send_to_client(client_sock, &c, 1); + if (!send_to_client(client_sock, &c, 1)) { + tcp_bridge_close_client(&client_sock, &line_pos); + break; + } if (c == '\n' || c == '\r') { // End of line - process the accumulated command @@ -321,9 +385,10 @@ static void tcp_bridge_task(void *pvParameters) // Parse and send the hex string int sent = s_config.uart_write(line_buf); + bool client_kept = true; if (sent > 0) { const char *ok_msg = "OK - sent\r\n"; - send_to_client(client_sock, ok_msg, strlen(ok_msg)); + client_kept = send_to_client(client_sock, ok_msg, strlen(ok_msg)); // Decode the sent message (will be logged via custom vprintf) if (s_config.decode_message && s_last_tx_len > 0) { @@ -336,11 +401,16 @@ static void tcp_bridge_task(void *pvParameters) } } else { const char *err_msg = "ERROR - invalid hex string\r\n"; - send_to_client(client_sock, err_msg, strlen(err_msg)); + client_kept = send_to_client(client_sock, err_msg, strlen(err_msg)); s_last_tx_len = 0; // Clear on error } line_pos = 0; // Reset for next line + + if (!client_kept) { + tcp_bridge_close_client(&client_sock, &line_pos); + break; + } } } else if (c == 0x08 || c == 0x7F) { // Backspace or delete - remove last character @@ -354,26 +424,22 @@ static void tcp_bridge_task(void *pvParameters) } else { // Buffer full - reset const char *overflow_msg = "\r\nERROR - line too long\r\n"; - send_to_client(client_sock, overflow_msg, strlen(overflow_msg)); + bool kept = send_to_client(client_sock, overflow_msg, strlen(overflow_msg)); line_pos = 0; + if (!kept) { + tcp_bridge_close_client(&client_sock, &line_pos); + break; + } } } } } else if (r == 0) { ESP_LOGI(TAG, "Client disconnected"); - tcp_bridge_set_log_client(-1); // Disable log forwarding - shutdown(client_sock, SHUT_RDWR); - close(client_sock); - client_sock = -1; - line_pos = 0; // Reset line buffer + tcp_bridge_close_client(&client_sock, &line_pos); } else { if (errno != EAGAIN && errno != EWOULDBLOCK) { ESP_LOGW(TAG, "Client recv error: errno %d", errno); - tcp_bridge_set_log_client(-1); // Disable log forwarding - shutdown(client_sock, SHUT_RDWR); - close(client_sock); - client_sock = -1; - line_pos = 0; // Reset line buffer + tcp_bridge_close_client(&client_sock, &line_pos); } } } From f432dfe2ba4e40d0c5f81d30a4296f1926648c4e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Jun 2026 10:09:45 +0000 Subject: [PATCH 2/3] Add periodic heap-stats logging and home-page Memory row MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /status JSON already exposed free/min-free heap, but a web readout only helps while the device is still responsive — useless once it has locked up. Add a low-priority heap_monitor task that logs free heap, the minimum-free watermark, and the largest free block every 5 minutes, so a slow leak or growing fragmentation shows up as a trend in the console/TCP log history before it exhausts memory. Largest-free-block specifically surfaces fragmentation (e.g. from repeated MQTT discovery republishes), which total free heap alone can hide. Also surface the heap figures on the home page: the System table now shows a Memory row (free / min-free) built from the memory fields already present in the /status response. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_018oMk4krXAUqhX5gWtvc1mf --- CHANGELOG.md | 1 + CLAUDE.md | 4 +++- main/CMakeLists.txt | 1 + main/heap_monitor.c | 41 +++++++++++++++++++++++++++++++++++++++++ main/heap_monitor.h | 14 ++++++++++++++ main/main.c | 5 +++++ main/web_handlers.c | 5 +++++ 7 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 main/heap_monitor.c create mode 100644 main/heap_monitor.h diff --git a/CHANGELOG.md b/CHANGELOG.md index b5c1bba..9ea6a6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Saving of unknown bus messages, added web UI to view them +- Periodic heap-stats logging (free, minimum-free watermark, largest free block) every 5 minutes via a new low-priority `heap_monitor` task, so a slow memory leak or growing fragmentation is visible as a trend in the console/TCP log history before it can exhaust the heap. The home page System table now also shows a Memory row (free / min-free), using the `memory` fields already present in the `/status` JSON. ### Changed - Note about CMD 0x05 observed with payload 0x00. Added to PROTOCOL.md and new sample trace. - Note about CMD 0x12 observed with payload 0x01 0x00. Added to PROTOCOL.md and new sample trace. diff --git a/CLAUDE.md b/CLAUDE.md index 679caac..896e84e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,6 +35,7 @@ Requires ESP-IDF v5.5+ with environment sourced (`. $IDF_PATH/export.sh`). - **mqtt_commands.c/.h**: MQTT command subscription and handling - **web_handlers.c/.h**: HTTP server endpoints (status, provisioning, MQTT config) - **led_helper.c/.h**: WS2812 LED control for status indication +- **heap_monitor.c/.h**: Low-priority task that periodically logs heap stats (free, min-free watermark, largest block) for leak/fragmentation diagnostics ### Key Components @@ -84,5 +85,6 @@ main.c │ ├─> mqtt_discovery (Home Assistant integration) │ └─> mqtt_commands (handle MQTT commands) ├─> web_handlers (HTTP API) - └─> led_helper (status LED) + ├─> led_helper (status LED) + └─> heap_monitor (periodic heap-stats logging) ``` diff --git a/main/CMakeLists.txt b/main/CMakeLists.txt index a3b01f4..11a1b5d 100644 --- a/main/CMakeLists.txt +++ b/main/CMakeLists.txt @@ -16,6 +16,7 @@ idf_component_register( "register_requester.c" "unknown_buffer.c" "dns_server.c" + "heap_monitor.c" INCLUDE_DIRS "." EMBED_TXTFILES diff --git a/main/heap_monitor.c b/main/heap_monitor.c new file mode 100644 index 0000000..c781bf4 --- /dev/null +++ b/main/heap_monitor.c @@ -0,0 +1,41 @@ +#include "heap_monitor.h" + +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" +#include "esp_heap_caps.h" +#include "esp_system.h" +#include "esp_log.h" + +static const char *TAG = "HEAP_MON"; + +// How often to log heap stats. Long enough that the overhead (one log line) is +// negligible, short enough to give useful resolution on a multi-day downward +// trend. +#define HEAP_MONITOR_INTERVAL_MS (5 * 60 * 1000) + +static void heap_monitor_task(void *arg) +{ + (void)arg; + for (;;) { + // free_now — heap available right now + // free_min — lowest free heap seen since boot (leak/pressure watermark) + // largest — largest single allocatable block; a gap between this and + // free_now that widens over time indicates fragmentation + uint32_t free_now = esp_get_free_heap_size(); + uint32_t free_min = esp_get_minimum_free_heap_size(); + size_t largest = heap_caps_get_largest_free_block(MALLOC_CAP_8BIT); + + ESP_LOGI(TAG, "heap: free=%lu B, min_free=%lu B, largest_block=%u B", + (unsigned long)free_now, (unsigned long)free_min, (unsigned)largest); + + vTaskDelay(pdMS_TO_TICKS(HEAP_MONITOR_INTERVAL_MS)); + } +} + +void heap_monitor_start(void) +{ + // Low priority (1) and a small stack: this task is never time-critical and + // only ever formats one log line. The stack still needs room for the + // logging path (vprintf plus the TCP log-forward hook's 256-byte buffer). + xTaskCreate(heap_monitor_task, "heap_mon", 3072, NULL, 1, NULL); +} diff --git a/main/heap_monitor.h b/main/heap_monitor.h new file mode 100644 index 0000000..aa831f6 --- /dev/null +++ b/main/heap_monitor.h @@ -0,0 +1,14 @@ +#ifndef HEAP_MONITOR_H +#define HEAP_MONITOR_H + +// Starts a low-priority background task that periodically logs heap statistics +// (current free, minimum-free watermark since boot, and largest free block). +// +// The point is diagnostic history: a slow leak or growing fragmentation shows +// up as a downward trend in free/largest-block over hours or days, visible in +// the console and TCP log stream *before* it exhausts the heap and crashes — +// which is exactly the information you can't recover from a device that has +// already locked up. +void heap_monitor_start(void); + +#endif // HEAP_MONITOR_H diff --git a/main/main.c b/main/main.c index c5976d2..2fc6a97 100644 --- a/main/main.c +++ b/main/main.c @@ -20,6 +20,7 @@ #include "message_decoder.h" #include "register_requester.h" #include "unknown_buffer.h" +#include "heap_monitor.h" // ==================== APPLICATION ===================== // All configuration values are in config.h @@ -216,4 +217,8 @@ void app_main(void) } register_requester_start(&s_pool_state, s_pool_state_mutex); + + // Periodically log heap stats so a slow leak or fragmentation trend is + // visible in the log history before it can exhaust memory and crash. + heap_monitor_start(); } diff --git a/main/web_handlers.c b/main/web_handlers.c index ed387d6..e3738b0 100644 --- a/main/web_handlers.c +++ b/main/web_handlers.c @@ -284,6 +284,11 @@ static esp_err_t home_get_handler(httpd_req_t *req) "const rtr=document.createElement('tr');" "rtr.innerHTML='Resyncs'+(rs.total||0)+(parts.length?' ('+parts.join(', ')+')':'')+'';" "document.getElementById('sys-body').appendChild(rtr);}" + "const mem=data.memory;" + "if(mem){" + "const memtr=document.createElement('tr');" + "memtr.innerHTML='Memory'+mem.free_heap+' free (min '+mem.min_free_heap+')';" + "document.getElementById('sys-body').appendChild(memtr);}" "const tb=document.getElementById('pool-body');" "rows.forEach(([k,v])=>{" "const tr=document.createElement('tr');" From ec5af29e385b6f4b4a14ed7c1e119bed5ad83575 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Jun 2026 10:14:40 +0000 Subject: [PATCH 3/3] Don't stop MQTT from the WiFi event handler on disconnect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit esp_mqtt_client_stop() blocks waiting for the MQTT task to acknowledge. Calling it from WIFI_EVENT_STA_DISCONNECTED runs it on the system event-loop task, so if the MQTT task was itself stuck on a dead socket during the same network outage, the stop would wedge the event loop and stall all further WiFi/IP event processing — a plausible silent-hang path under WiFi flapping. Follow the documented esp-mqtt pattern instead: start the client once on first connectivity and let esp-mqtt manage reconnection internally across WiFi drops and restores. mqtt_client_start() is now idempotent so the GOT_IP handler can keep calling it on every reconnect as a no-op. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_018oMk4krXAUqhX5gWtvc1mf --- CHANGELOG.md | 1 + main/mqtt_poolclient.c | 7 +++++++ main/wifi_provisioning.c | 13 ++++++++++--- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ea6a6d..4216920 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed ### Fixed - TCP bridge no longer hangs when a connected client stalls or dies silently. The client socket is now non-blocking with a keep/drop policy on every send (full send keeps the client; a full send buffer drops the message but keeps the connection; a partial write or hard error drops the client), so a stuck peer can never wedge the bridge task — which is also the only task reading the UART, meaning such a wedge previously froze the whole device with no logs and no recovery. Added TCP keepalive (idle 30s, interval 5s, count 3) so a silently dead peer is detected and the single client slot freed. +- MQTT client is no longer stopped from inside the WiFi event handler on disconnect. `esp_mqtt_client_stop()` blocks waiting for the MQTT task, and calling it from the system event-loop task could stall all further WiFi/IP event processing if the MQTT task was itself stuck on a dead socket during the same outage. The client is now started once on first connectivity and left to esp-mqtt's built-in reconnection, which handles WiFi drops/restores on its own. `mqtt_client_start()` is now idempotent so repeated reconnects are no-ops. ### Deprecated ### Security diff --git a/main/mqtt_poolclient.c b/main/mqtt_poolclient.c index 6ab9523..3764be4 100644 --- a/main/mqtt_poolclient.c +++ b/main/mqtt_poolclient.c @@ -332,6 +332,13 @@ esp_err_t mqtt_client_start(void) return ESP_ERR_INVALID_STATE; } + if (s_mqtt_started) { + // Already running. esp-mqtt manages reconnection internally, so this is + // expected to be called again on every WiFi reconnect — no-op instead + // of letting esp_mqtt_client_start() log a "Client has started" error. + return ESP_OK; + } + esp_err_t err = esp_mqtt_client_start(s_mqtt_client); if (err != ESP_OK) { ESP_LOGE(TAG, "Failed to start MQTT client: %s", esp_err_to_name(err)); diff --git a/main/wifi_provisioning.c b/main/wifi_provisioning.c index 025a0ac..db10c14 100644 --- a/main/wifi_provisioning.c +++ b/main/wifi_provisioning.c @@ -162,8 +162,13 @@ static void wifi_event_handler(void *arg, s_device_ip_address[0] = '\0'; xEventGroupClearBits(s_wifi_event_group, WIFI_CONNECTED_BIT); - // Stop MQTT client on WiFi disconnect - mqtt_client_stop(); + // Deliberately do NOT stop the MQTT client here. This handler runs in + // the system event-loop task, and esp_mqtt_client_stop() blocks waiting + // for the MQTT task to acknowledge — if that task is itself stuck on a + // dead socket during the same outage, stopping it here would wedge the + // event loop and stall all further WiFi/IP event processing. esp-mqtt + // detects the broken connection and reconnects on its own once WiFi + // returns, firing MQTT_EVENT_DISCONNECTED (which updates the LED). if (!s_provisioning_active) { s_wifi_retry_count++; @@ -224,7 +229,9 @@ static void wifi_event_handler(void *arg, led_set_connected(); xEventGroupSetBits(s_wifi_event_group, WIFI_CONNECTED_BIT); - // Start MQTT client on WiFi connect + // Start the MQTT client the first time we have connectivity. This is + // idempotent: on later reconnects esp-mqtt is already running and + // managing its own reconnection, so this call is a no-op. mqtt_client_start(); // Start mDNS service for network discovery