diff --git a/CHANGELOG.md b/CHANGELOG.md index 974f3f3..4216920 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Saving of unknown bus messages, added web UI to view them +- Periodic heap-stats logging (free, minimum-free watermark, largest free block) every 5 minutes via a new low-priority `heap_monitor` task, so a slow memory leak or growing fragmentation is visible as a trend in the console/TCP log history before it can exhaust the heap. The home page System table now also shows a Memory row (free / min-free), using the `memory` fields already present in the `/status` JSON. ### Changed - Note about CMD 0x05 observed with payload 0x00. Added to PROTOCOL.md and new sample trace. - Note about CMD 0x12 observed with payload 0x01 0x00. Added to PROTOCOL.md and new sample trace. @@ -31,6 +32,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Frame parser and decoder now also accept "discovery" packets (control bytes `0x00 0x00`): a shorter 11-byte frame shape with no payload and no data-checksum byte, alongside the existing `0x80 0x00` data packets ### Removed ### Fixed +- TCP bridge no longer hangs when a connected client stalls or dies silently. The client socket is now non-blocking with a keep/drop policy on every send (full send keeps the client; a full send buffer drops the message but keeps the connection; a partial write or hard error drops the client), so a stuck peer can never wedge the bridge task — which is also the only task reading the UART, meaning such a wedge previously froze the whole device with no logs and no recovery. Added TCP keepalive (idle 30s, interval 5s, count 3) so a silently dead peer is detected and the single client slot freed. +- MQTT client is no longer stopped from inside the WiFi event handler on disconnect. `esp_mqtt_client_stop()` blocks waiting for the MQTT task, and calling it from the system event-loop task could stall all further WiFi/IP event processing if the MQTT task was itself stuck on a dead socket during the same outage. The client is now started once on first connectivity and left to esp-mqtt's built-in reconnection, which handles WiFi drops/restores on its own. `mqtt_client_start()` is now idempotent so repeated reconnects are no-ops. ### Deprecated ### Security diff --git a/CLAUDE.md b/CLAUDE.md index 679caac..896e84e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,6 +35,7 @@ Requires ESP-IDF v5.5+ with environment sourced (`. $IDF_PATH/export.sh`). - **mqtt_commands.c/.h**: MQTT command subscription and handling - **web_handlers.c/.h**: HTTP server endpoints (status, provisioning, MQTT config) - **led_helper.c/.h**: WS2812 LED control for status indication +- **heap_monitor.c/.h**: Low-priority task that periodically logs heap stats (free, min-free watermark, largest block) for leak/fragmentation diagnostics ### Key Components @@ -84,5 +85,6 @@ main.c │ ├─> mqtt_discovery (Home Assistant integration) │ └─> mqtt_commands (handle MQTT commands) ├─> web_handlers (HTTP API) - └─> led_helper (status LED) + ├─> led_helper (status LED) + └─> heap_monitor (periodic heap-stats logging) ``` diff --git a/main/CMakeLists.txt b/main/CMakeLists.txt index a3b01f4..11a1b5d 100644 --- a/main/CMakeLists.txt +++ b/main/CMakeLists.txt @@ -16,6 +16,7 @@ idf_component_register( "register_requester.c" "unknown_buffer.c" "dns_server.c" + "heap_monitor.c" INCLUDE_DIRS "." EMBED_TXTFILES diff --git a/main/config.h b/main/config.h index 5256020..77a8704 100644 --- a/main/config.h +++ b/main/config.h @@ -101,6 +101,13 @@ #define TCP_TASK_STACK_SIZE 8192 // TCP bridge task stack size #define TCP_TASK_PRIORITY 5 // TCP bridge task priority +// TCP keepalive — reap a silently dead client (one that never sends FIN/RST) +// so the single client slot is freed. Worst-case detection time is roughly +// IDLE + INTERVAL * COUNT seconds. +#define TCP_KEEPALIVE_IDLE_SEC 30 // Idle seconds before the first keepalive probe +#define TCP_KEEPALIVE_INTERVAL_SEC 5 // Seconds between probes +#define TCP_KEEPALIVE_COUNT 3 // Failed probes before dropping the connection + // ====================================================== // Buffer Sizes // ====================================================== diff --git a/main/heap_monitor.c b/main/heap_monitor.c new file mode 100644 index 0000000..c781bf4 --- /dev/null +++ b/main/heap_monitor.c @@ -0,0 +1,41 @@ +#include "heap_monitor.h" + +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" +#include "esp_heap_caps.h" +#include "esp_system.h" +#include "esp_log.h" + +static const char *TAG = "HEAP_MON"; + +// How often to log heap stats. Long enough that the overhead (one log line) is +// negligible, short enough to give useful resolution on a multi-day downward +// trend. +#define HEAP_MONITOR_INTERVAL_MS (5 * 60 * 1000) + +static void heap_monitor_task(void *arg) +{ + (void)arg; + for (;;) { + // free_now — heap available right now + // free_min — lowest free heap seen since boot (leak/pressure watermark) + // largest — largest single allocatable block; a gap between this and + // free_now that widens over time indicates fragmentation + uint32_t free_now = esp_get_free_heap_size(); + uint32_t free_min = esp_get_minimum_free_heap_size(); + size_t largest = heap_caps_get_largest_free_block(MALLOC_CAP_8BIT); + + ESP_LOGI(TAG, "heap: free=%lu B, min_free=%lu B, largest_block=%u B", + (unsigned long)free_now, (unsigned long)free_min, (unsigned)largest); + + vTaskDelay(pdMS_TO_TICKS(HEAP_MONITOR_INTERVAL_MS)); + } +} + +void heap_monitor_start(void) +{ + // Low priority (1) and a small stack: this task is never time-critical and + // only ever formats one log line. The stack still needs room for the + // logging path (vprintf plus the TCP log-forward hook's 256-byte buffer). + xTaskCreate(heap_monitor_task, "heap_mon", 3072, NULL, 1, NULL); +} diff --git a/main/heap_monitor.h b/main/heap_monitor.h new file mode 100644 index 0000000..aa831f6 --- /dev/null +++ b/main/heap_monitor.h @@ -0,0 +1,14 @@ +#ifndef HEAP_MONITOR_H +#define HEAP_MONITOR_H + +// Starts a low-priority background task that periodically logs heap statistics +// (current free, minimum-free watermark since boot, and largest free block). +// +// The point is diagnostic history: a slow leak or growing fragmentation shows +// up as a downward trend in free/largest-block over hours or days, visible in +// the console and TCP log stream *before* it exhausts the heap and crashes — +// which is exactly the information you can't recover from a device that has +// already locked up. +void heap_monitor_start(void); + +#endif // HEAP_MONITOR_H diff --git a/main/main.c b/main/main.c index c5976d2..2fc6a97 100644 --- a/main/main.c +++ b/main/main.c @@ -20,6 +20,7 @@ #include "message_decoder.h" #include "register_requester.h" #include "unknown_buffer.h" +#include "heap_monitor.h" // ==================== APPLICATION ===================== // All configuration values are in config.h @@ -216,4 +217,8 @@ void app_main(void) } register_requester_start(&s_pool_state, s_pool_state_mutex); + + // Periodically log heap stats so a slow leak or fragmentation trend is + // visible in the log history before it can exhaust memory and crash. + heap_monitor_start(); } diff --git a/main/mqtt_poolclient.c b/main/mqtt_poolclient.c index 6ab9523..3764be4 100644 --- a/main/mqtt_poolclient.c +++ b/main/mqtt_poolclient.c @@ -332,6 +332,13 @@ esp_err_t mqtt_client_start(void) return ESP_ERR_INVALID_STATE; } + if (s_mqtt_started) { + // Already running. esp-mqtt manages reconnection internally, so this is + // expected to be called again on every WiFi reconnect — no-op instead + // of letting esp_mqtt_client_start() log a "Client has started" error. + return ESP_OK; + } + esp_err_t err = esp_mqtt_client_start(s_mqtt_client); if (err != ESP_OK) { ESP_LOGE(TAG, "Failed to start MQTT client: %s", esp_err_to_name(err)); diff --git a/main/tcp_bridge.c b/main/tcp_bridge.c index 285ee36..19a8c95 100644 --- a/main/tcp_bridge.c +++ b/main/tcp_bridge.c @@ -53,16 +53,37 @@ static SemaphoreHandle_t s_stopped_sem = NULL; * Send data to the TCP client under the client mutex. * All sends to client_sock must go through this to prevent interleaving * with the log vprintf callback, which can fire from any task at any time. + * + * The client socket is non-blocking, so a stalled or dead peer can never wedge + * the bridge task. Returns true if the client should be kept, false if it + * should be dropped: + * - full send -> keep + * - buffer full (EAGAIN) -> message dropped, keep. The hex stream is a + * best-effort debug feed; a truly dead peer is + * reaped by TCP keepalive / the recv path. + * - partial write -> drop, since the hex stream is now desynced and a + * clean reconnect is the safe recovery. + * - other send error -> drop. + * Mutex contention is never the client's fault, so it keeps the client. */ -static void send_to_client(int sock, const void *data, int len) +static bool send_to_client(int sock, const void *data, int len) { if (sock < 0 || len <= 0) { - return; + return true; } + bool keep = true; if (s_log_mutex && xSemaphoreTake(s_log_mutex, pdMS_TO_TICKS(MUTEX_TIMEOUT_MS)) == pdTRUE) { - send(sock, data, len, 0); + int sent = send(sock, data, len, MSG_DONTWAIT); + if (sent < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + keep = false; + } + } else if (sent < len) { + keep = false; + } xSemaphoreGive(s_log_mutex); } + return keep; } /** @@ -109,6 +130,21 @@ static void tcp_bridge_set_log_client(int sock) } } +/** + * Tear down the current TCP client: disable log forwarding, close the socket, + * and reset the per-client line buffer. Safe to call with no client connected. + */ +static void tcp_bridge_close_client(int *client_sock, int *line_pos) +{ + tcp_bridge_set_log_client(-1); // Disable log forwarding before closing the fd + if (*client_sock >= 0) { + shutdown(*client_sock, SHUT_RDWR); + close(*client_sock); + } + *client_sock = -1; + *line_pos = 0; +} + /** * TCP server task implementation @@ -179,18 +215,40 @@ static void tcp_bridge_task(void *pvParameters) inet_ntoa_r(client_addr.sin_addr, addr_str, sizeof(addr_str) - 1); ESP_LOGI(TAG, "Client connected from %s", addr_str); - const char *hello = - "Connected to pool control bus bridge.\r\n" - "UART bytes will be shown here in hex.\r\n" - "Decoded messages will also be shown.\r\n" - "Send hex strings (e.g., '02 00 50 FF FF 03') to transmit to the bus.\r\n\r\n"; - send_to_client(client_sock, hello, strlen(hello)); + // Make the client socket non-blocking so a stalled or dead peer + // can never wedge the bridge task — which is also the only task + // reading the UART. A full send buffer now returns immediately + // instead of blocking the task indefinitely. + int cflags = fcntl(client_sock, F_GETFL, 0); + fcntl(client_sock, F_SETFL, cflags | O_NONBLOCK); + + // Enable TCP keepalive so a silently dead peer (one that never + // sends FIN/RST) is detected and the single client slot freed, + // without relying on the peer to send us anything. + int ka_enable = 1; + int ka_idle = TCP_KEEPALIVE_IDLE_SEC; + int ka_intvl = TCP_KEEPALIVE_INTERVAL_SEC; + int ka_count = TCP_KEEPALIVE_COUNT; + setsockopt(client_sock, SOL_SOCKET, SO_KEEPALIVE, &ka_enable, sizeof(ka_enable)); + setsockopt(client_sock, IPPROTO_TCP, TCP_KEEPIDLE, &ka_idle, sizeof(ka_idle)); + setsockopt(client_sock, IPPROTO_TCP, TCP_KEEPINTVL, &ka_intvl, sizeof(ka_intvl)); + setsockopt(client_sock, IPPROTO_TCP, TCP_KEEPCNT, &ka_count, sizeof(ka_count)); // Enable log forwarding for this client tcp_bridge_set_log_client(client_sock); // Reset line buffer for new client line_pos = 0; + + const char *hello = + "Connected to pool control bus bridge.\r\n" + "UART bytes will be shown here in hex.\r\n" + "Decoded messages will also be shown.\r\n" + "Send hex strings (e.g., '02 00 50 FF FF 03') to transmit to the bus.\r\n\r\n"; + if (!send_to_client(client_sock, hello, strlen(hello))) { + ESP_LOGI(TAG, "Client dropped during greeting"); + tcp_bridge_close_client(&client_sock, &line_pos); + } } } @@ -250,7 +308,10 @@ static void tcp_bridge_task(void *pvParameters) if (client_sock >= 0) { hexLine[hex_pos] = '\r'; hexLine[hex_pos + 1] = '\n'; - send_to_client(client_sock, hexLine, hex_pos + 2); + if (!send_to_client(client_sock, hexLine, hex_pos + 2)) { + // Decoding above already ran; just drop the client. + tcp_bridge_close_client(&client_sock, &line_pos); + } } } else if (s_config.on_resync) { // Map each framing failure to its own resync category so the @@ -290,7 +351,10 @@ static void tcp_bridge_task(void *pvParameters) char c = tcp_buf[i]; // Echo the character back to the client - send_to_client(client_sock, &c, 1); + if (!send_to_client(client_sock, &c, 1)) { + tcp_bridge_close_client(&client_sock, &line_pos); + break; + } if (c == '\n' || c == '\r') { // End of line - process the accumulated command @@ -321,9 +385,10 @@ static void tcp_bridge_task(void *pvParameters) // Parse and send the hex string int sent = s_config.uart_write(line_buf); + bool client_kept = true; if (sent > 0) { const char *ok_msg = "OK - sent\r\n"; - send_to_client(client_sock, ok_msg, strlen(ok_msg)); + client_kept = send_to_client(client_sock, ok_msg, strlen(ok_msg)); // Decode the sent message (will be logged via custom vprintf) if (s_config.decode_message && s_last_tx_len > 0) { @@ -336,11 +401,16 @@ static void tcp_bridge_task(void *pvParameters) } } else { const char *err_msg = "ERROR - invalid hex string\r\n"; - send_to_client(client_sock, err_msg, strlen(err_msg)); + client_kept = send_to_client(client_sock, err_msg, strlen(err_msg)); s_last_tx_len = 0; // Clear on error } line_pos = 0; // Reset for next line + + if (!client_kept) { + tcp_bridge_close_client(&client_sock, &line_pos); + break; + } } } else if (c == 0x08 || c == 0x7F) { // Backspace or delete - remove last character @@ -354,26 +424,22 @@ static void tcp_bridge_task(void *pvParameters) } else { // Buffer full - reset const char *overflow_msg = "\r\nERROR - line too long\r\n"; - send_to_client(client_sock, overflow_msg, strlen(overflow_msg)); + bool kept = send_to_client(client_sock, overflow_msg, strlen(overflow_msg)); line_pos = 0; + if (!kept) { + tcp_bridge_close_client(&client_sock, &line_pos); + break; + } } } } } else if (r == 0) { ESP_LOGI(TAG, "Client disconnected"); - tcp_bridge_set_log_client(-1); // Disable log forwarding - shutdown(client_sock, SHUT_RDWR); - close(client_sock); - client_sock = -1; - line_pos = 0; // Reset line buffer + tcp_bridge_close_client(&client_sock, &line_pos); } else { if (errno != EAGAIN && errno != EWOULDBLOCK) { ESP_LOGW(TAG, "Client recv error: errno %d", errno); - tcp_bridge_set_log_client(-1); // Disable log forwarding - shutdown(client_sock, SHUT_RDWR); - close(client_sock); - client_sock = -1; - line_pos = 0; // Reset line buffer + tcp_bridge_close_client(&client_sock, &line_pos); } } } diff --git a/main/web_handlers.c b/main/web_handlers.c index ed387d6..e3738b0 100644 --- a/main/web_handlers.c +++ b/main/web_handlers.c @@ -284,6 +284,11 @@ static esp_err_t home_get_handler(httpd_req_t *req) "const rtr=document.createElement('tr');" "rtr.innerHTML='