Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "a3s-gateway"
version = "1.0.7"
version = "1.0.11"
edition = "2021"
rust-version = "1.88"
authors = ["A3S Lab"]
Expand Down
18 changes: 14 additions & 4 deletions src/provider/kubernetes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,20 @@ pub fn ingress_to_config(ingresses: &[IngressResource]) -> GatewayConfig {
};

for path in &http.paths {
let svc_name = format!(
"{}-{}-{}",
ingress.namespace, ingress.name, path.backend.service.name
);
// Router/service key. When the Ingress name already equals the
// backend Service name (the image-app-publish convention: both
// are the app/release name), concatenating all three segments
// yields a redundant doubled key like `default-arche-arche`.
// Collapse the duplicate so it reads `default-arche` while
// staying unique (Ingress names are unique per namespace).
let svc_name = if ingress.name == path.backend.service.name {
format!("{}-{}", ingress.namespace, ingress.name)
} else {
format!(
"{}-{}-{}",
ingress.namespace, ingress.name, path.backend.service.name
)
};

// Build service with backend URL
let port = if path.backend.service.port.number > 0 {
Expand Down
17 changes: 14 additions & 3 deletions src/proxy/http_proxy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,27 @@ impl HttpProxy {
pub fn with_timeout(timeout: Duration) -> Self {
let mut connector = HttpConnector::new();
connector.set_nodelay(true);
connector.set_keepalive(Some(Duration::from_secs(90)));
// TCP keepalive 15s (was 90s): detect a dead upstream (e.g. a backend pod
// terminated during a K8s rollout) and tear the socket down promptly.
connector.set_keepalive(Some(Duration::from_secs(15)));
connector.set_reuse_address(true);

// pool_idle_timeout 5s (was 90s): hyper keys the idle connection pool by hostname,
// NOT by resolved IP. When a backend pod rolls (Deployment rollout → new pod IP),
// pooled keep-alive sockets to the OLD pod IP linger and get reused → SendRequest
// fails → passive-health marks the backend unhealthy → the half-open recovery probe
// reuses ANOTHER stale socket → permanent 503 "No healthy backends" until the gateway
// is restarted. Evicting idle sockets after 5s (well under passive-health
// recovery_time, 10s) guarantees the half-open probe opens a FRESH connection that
// re-resolves DNS to the new pod IP — so the gateway self-heals after a rollout
// instead of requiring a manual restart.
let client = Client::builder(TokioExecutor::new())
.pool_idle_timeout(Duration::from_secs(90))
.pool_idle_timeout(Duration::from_secs(5))
.pool_max_idle_per_host(200)
.build(connector.clone());

let stream_client = Client::builder(TokioExecutor::new())
.pool_idle_timeout(Duration::from_secs(90))
.pool_idle_timeout(Duration::from_secs(5))
.pool_max_idle_per_host(200)
.build(connector);

Expand Down
22 changes: 18 additions & 4 deletions src/proxy/streaming.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,21 @@ use std::time::Duration;
/// Shared reqwest client for streaming requests — reuses connection pool across calls
static STREAMING_CLIENT: OnceLock<reqwest::Client> = OnceLock::new();

/// Idle read timeout for streaming/SSE responses. This is the max gap the
/// upstream may go silent — NOT a total-request deadline. The API emits an SSE
/// keep-alive every ~10s, so a healthy stream never trips it and can run
/// indefinitely; only a genuinely dead upstream is reaped after this window.
const STREAM_IDLE_TIMEOUT_SECS: u64 = 300;

fn streaming_client() -> &'static reqwest::Client {
STREAMING_CLIENT.get_or_init(|| {
reqwest::Client::builder()
.pool_max_idle_per_host(100)
// read_timeout = per-read (idle) timeout, reset on every byte —
// unlike .timeout()/RequestBuilder::timeout which caps the *whole*
// request including the streamed body and hard-killed every SSE
// stream after 5 minutes regardless of activity.
.read_timeout(Duration::from_secs(STREAM_IDLE_TIMEOUT_SECS))
.build()
.unwrap_or_default()
})
Expand Down Expand Up @@ -90,10 +101,13 @@ pub async fn forward_streaming(
let path_and_query = uri.path_and_query().map(|pq| pq.as_str()).unwrap_or("/");
let upstream_url = format!("{}{}", backend_url, path_and_query);

// Reuse shared client — connection pool survives across streaming requests
let mut req_builder = streaming_client()
.request(method.clone(), &upstream_url)
.timeout(Duration::from_secs(timeout_secs));
// Reuse shared client — its read_timeout (idle, see STREAM_IDLE_TIMEOUT_SECS)
// governs streaming liveness. Deliberately NO per-request .timeout() here:
// reqwest's total-request timeout would cap the whole streamed body and
// hard-kill every SSE/chunked response after `timeout_secs` regardless of
// activity (that was the 5-minute SSE cutoff). `timeout_secs` is still used
// below to label an UpstreamTimeout if the initial response never arrives.
let mut req_builder = streaming_client().request(method.clone(), &upstream_url);

// Forward headers (skip hop-by-hop) — eq_ignore_ascii_case avoids to_lowercase() alloc
for (key, value) in headers.iter() {
Expand Down
8 changes: 6 additions & 2 deletions src/service/passive_health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@ impl Default for PassiveHealthConfig {
error_threshold: 5,
window: Duration::from_secs(30),
error_status_codes: vec![500, 502, 503, 504],
recovery_time: Duration::from_secs(30),
// 10s (was 30s): re-probe a backend ~10s after marking it unhealthy. Combined
// with the 5s upstream pool idle timeout (see proxy/http_proxy.rs), the half-open
// probe lands on a FRESH connection to the rolled pod's new IP — bounding
// rollout-induced 503s to ~10s of auto-recovery instead of a permanent outage.
recovery_time: Duration::from_secs(10),
}
}
}
Expand Down Expand Up @@ -269,7 +273,7 @@ mod tests {
assert_eq!(config.error_threshold, 5);
assert_eq!(config.window, Duration::from_secs(30));
assert_eq!(config.error_status_codes, vec![500, 502, 503, 504]);
assert_eq!(config.recovery_time, Duration::from_secs(30));
assert_eq!(config.recovery_time, Duration::from_secs(10));
}

// --- Construction ---
Expand Down