A3S-Lab · ZhiXiao-Lin · Jun 10, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "a3s-gateway"
-version = "1.0.7"
+version = "1.0.11"
 edition = "2021"
 rust-version = "1.88"
 authors = ["A3S Lab"]

diff --git a/src/provider/kubernetes.rs b/src/provider/kubernetes.rs
@@ -204,10 +204,20 @@ pub fn ingress_to_config(ingresses: &[IngressResource]) -> GatewayConfig {
             };
 
             for path in &http.paths {
-                let svc_name = format!(
-                    "{}-{}-{}",
-                    ingress.namespace, ingress.name, path.backend.service.name
-                );
+                // Router/service key. When the Ingress name already equals the
+                // backend Service name (the image-app-publish convention: both
+                // are the app/release name), concatenating all three segments
+                // yields a redundant doubled key like `default-arche-arche`.
+                // Collapse the duplicate so it reads `default-arche` while
+                // staying unique (Ingress names are unique per namespace).
+                let svc_name = if ingress.name == path.backend.service.name {
+                    format!("{}-{}", ingress.namespace, ingress.name)
+                } else {
+                    format!(
+                        "{}-{}-{}",
+                        ingress.namespace, ingress.name, path.backend.service.name
+                    )
+                };
 
                 // Build service with backend URL
                 let port = if path.backend.service.port.number > 0 {

diff --git a/src/proxy/http_proxy.rs b/src/proxy/http_proxy.rs
@@ -30,16 +30,27 @@ impl HttpProxy {
     pub fn with_timeout(timeout: Duration) -> Self {
         let mut connector = HttpConnector::new();
         connector.set_nodelay(true);
-        connector.set_keepalive(Some(Duration::from_secs(90)));
+        // TCP keepalive 15s (was 90s): detect a dead upstream (e.g. a backend pod
+        // terminated during a K8s rollout) and tear the socket down promptly.
+        connector.set_keepalive(Some(Duration::from_secs(15)));
         connector.set_reuse_address(true);
 
+        // pool_idle_timeout 5s (was 90s): hyper keys the idle connection pool by hostname,
+        // NOT by resolved IP. When a backend pod rolls (Deployment rollout → new pod IP),
+        // pooled keep-alive sockets to the OLD pod IP linger and get reused → SendRequest
+        // fails → passive-health marks the backend unhealthy → the half-open recovery probe
+        // reuses ANOTHER stale socket → permanent 503 "No healthy backends" until the gateway
+        // is restarted. Evicting idle sockets after 5s (well under passive-health
+        // recovery_time, 10s) guarantees the half-open probe opens a FRESH connection that
+        // re-resolves DNS to the new pod IP — so the gateway self-heals after a rollout
+        // instead of requiring a manual restart.
         let client = Client::builder(TokioExecutor::new())
-            .pool_idle_timeout(Duration::from_secs(90))
+            .pool_idle_timeout(Duration::from_secs(5))
             .pool_max_idle_per_host(200)
             .build(connector.clone());
 
         let stream_client = Client::builder(TokioExecutor::new())
-            .pool_idle_timeout(Duration::from_secs(90))
+            .pool_idle_timeout(Duration::from_secs(5))
             .pool_max_idle_per_host(200)
             .build(connector);
 

diff --git a/src/proxy/streaming.rs b/src/proxy/streaming.rs
@@ -12,10 +12,21 @@ use std::time::Duration;
 /// Shared reqwest client for streaming requests — reuses connection pool across calls
 static STREAMING_CLIENT: OnceLock<reqwest::Client> = OnceLock::new();
 
+/// Idle read timeout for streaming/SSE responses. This is the max gap the
+/// upstream may go silent — NOT a total-request deadline. The API emits an SSE
+/// keep-alive every ~10s, so a healthy stream never trips it and can run
+/// indefinitely; only a genuinely dead upstream is reaped after this window.
+const STREAM_IDLE_TIMEOUT_SECS: u64 = 300;
+
 fn streaming_client() -> &'static reqwest::Client {
     STREAMING_CLIENT.get_or_init(|| {
         reqwest::Client::builder()
             .pool_max_idle_per_host(100)
+            // read_timeout = per-read (idle) timeout, reset on every byte —
+            // unlike .timeout()/RequestBuilder::timeout which caps the *whole*
+            // request including the streamed body and hard-killed every SSE
+            // stream after 5 minutes regardless of activity.
+            .read_timeout(Duration::from_secs(STREAM_IDLE_TIMEOUT_SECS))
             .build()
             .unwrap_or_default()
     })
@@ -90,10 +101,13 @@ pub async fn forward_streaming(
     let path_and_query = uri.path_and_query().map(|pq| pq.as_str()).unwrap_or("/");
     let upstream_url = format!("{}{}", backend_url, path_and_query);
 
-    // Reuse shared client — connection pool survives across streaming requests
-    let mut req_builder = streaming_client()
-        .request(method.clone(), &upstream_url)
-        .timeout(Duration::from_secs(timeout_secs));
+    // Reuse shared client — its read_timeout (idle, see STREAM_IDLE_TIMEOUT_SECS)
+    // governs streaming liveness. Deliberately NO per-request .timeout() here:
+    // reqwest's total-request timeout would cap the whole streamed body and
+    // hard-kill every SSE/chunked response after `timeout_secs` regardless of
+    // activity (that was the 5-minute SSE cutoff). `timeout_secs` is still used
+    // below to label an UpstreamTimeout if the initial response never arrives.
+    let mut req_builder = streaming_client().request(method.clone(), &upstream_url);
 
     // Forward headers (skip hop-by-hop) — eq_ignore_ascii_case avoids to_lowercase() alloc
     for (key, value) in headers.iter() {

diff --git a/src/service/passive_health.rs b/src/service/passive_health.rs
@@ -28,7 +28,11 @@ impl Default for PassiveHealthConfig {
             error_threshold: 5,
             window: Duration::from_secs(30),
             error_status_codes: vec![500, 502, 503, 504],
-            recovery_time: Duration::from_secs(30),
+            // 10s (was 30s): re-probe a backend ~10s after marking it unhealthy. Combined
+            // with the 5s upstream pool idle timeout (see proxy/http_proxy.rs), the half-open
+            // probe lands on a FRESH connection to the rolled pod's new IP — bounding
+            // rollout-induced 503s to ~10s of auto-recovery instead of a permanent outage.
+            recovery_time: Duration::from_secs(10),
         }
     }
 }
@@ -269,7 +273,7 @@ mod tests {
         assert_eq!(config.error_threshold, 5);
         assert_eq!(config.window, Duration::from_secs(30));
         assert_eq!(config.error_status_codes, vec![500, 502, 503, 504]);
-        assert_eq!(config.recovery_time, Duration::from_secs(30));
+        assert_eq!(config.recovery_time, Duration::from_secs(10));
     }
 
     // --- Construction ---