diff --git a/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml b/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml index 3110ccea7..266ea1b96 100644 --- a/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml +++ b/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml @@ -167,6 +167,8 @@ spec: echo "========== End parameters ==========" - name: lightspeed-stack-integration-tests description: Task to run integration tests from lightspeed-stack repository + # Full Behave suite (proxy + tls) can exceed 2h; needs PipelineRun timeouts >= this value. + timeout: 3h params: - name: SNAPSHOT value: $(params.SNAPSHOT) diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml new file mode 100644 index 000000000..d1b908eb4 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml @@ -0,0 +1,104 @@ +# Mock HTTPS OpenAI API for tls-*.feature (Konflux / Prow; no Docker Compose). 
+# Llama Stack run.yaml uses https://e2e-mock-tls-inference.${NAMESPACE}.svc.cluster.local:8443|8444|8445/v1 +apiVersion: v1 +kind: Pod +metadata: + name: e2e-mock-tls-inference + labels: + app: e2e-mock-tls-inference +spec: + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - name: e2e-mock-tls-inference + image: python:3.12-slim + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: PYTHONPATH + value: /app:/tmp/pydeps + command: + - /bin/sh + - -c + - | + set -e + pip install --quiet --no-cache-dir --target /tmp/pydeps 'trustme>=1.2.1' 'cryptography>=42.0.0' + NS="${POD_NAMESPACE:-default}" + export TLS_CERT_DNS_NAMES="mock-tls-inference,localhost,127.0.0.1,e2e-mock-tls-inference,e2e-mock-tls-inference.${NS}.svc.cluster.local" + exec python /app/server.py + ports: + - containerPort: 8443 + name: tls + - containerPort: 8444 + name: mtls + - containerPort: 8445 + name: mismatch + volumeMounts: + - name: server-script + mountPath: /app/server.py + subPath: server.py + readOnly: true + - name: certs-work + mountPath: /certs + readinessProbe: + exec: + command: + - python3 + - -c + - | + import ssl, urllib.request + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + urllib.request.urlopen("https://localhost:8443/health", context=ctx) + initialDelaySeconds: 8 + periodSeconds: 5 + livenessProbe: + exec: + command: + - python3 + - -c + - | + import ssl, urllib.request + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + urllib.request.urlopen("https://localhost:8443/health", context=ctx) + initialDelaySeconds: 15 + periodSeconds: 20 + volumes: + - name: server-script + configMap: + name: e2e-mock-tls-inference-script + - name: 
certs-work + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: e2e-mock-tls-inference +spec: + selector: + app: e2e-mock-tls-inference + ports: + - name: tls + port: 8443 + targetPort: tls + - name: mtls + port: 8444 + targetPort: mtls + - name: mismatch + port: 8445 + targetPort: mismatch diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml index db166d0eb..6508cec35 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml @@ -38,6 +38,28 @@ spec: - -c - | set -e + # Fast-path: PVC already has a valid venv from a previous pod creation in this pipeline run. + # TLS scenarios delete+recreate this pod up to 16 times; skipping the expensive install + # reduces per-restart time from ~6-15 min to ~30-90 s (just RAG seed refresh + chown). + if [[ -d /opt/app-root/.venv ]] \ + && /opt/app-root/.venv/bin/python --version >/dev/null 2>&1 \ + && [[ -d /opt/app-root/src ]]; then + echo "PVC cache hit: app-root already provisioned — skipping full install" + mkdir -p /opt/app-root/.e2e-rag-seed /opt/app-root/src/.llama/storage/rag /opt/app-root/src/.llama/storage/files + if [[ -f /rag-seed/kv_store.db.gz ]]; then + gzip -dc /rag-seed/kv_store.db.gz > /opt/app-root/.e2e-rag-seed/kv_store.db + _sz=$(stat -c%s /opt/app-root/.e2e-rag-seed/kv_store.db) + if [[ "${_sz}" -lt 1048576 ]]; then + echo "FATAL: RAG seed too small (${_sz} bytes); check rag-data ConfigMap" + exit 1 + fi + cp -f /opt/app-root/.e2e-rag-seed/kv_store.db /opt/app-root/src/.llama/storage/rag/kv_store.db + fi + chmod -R 775 /opt/app-root && chown -R 1001:0 /opt/app-root + echo "PVC fast-path complete" + exit 0 + fi + # Full provisioning (PVC is empty — first pod creation this pipeline run). 
REPO_URL="${REPO_URL:-https://github.com/lightspeed-core/lightspeed-stack.git}" REPO_REVISION="${REPO_REVISION:-main}" case "$REPO_URL" in git@github.com:*) REPO_URL="https://github.com/${REPO_URL#git@github.com:}"; esac @@ -201,9 +223,14 @@ spec: mountPath: /tmp/interception-proxy-ca.pem subPath: ca.pem readOnly: true + # tls-*.feature: client/CA PEMs from Secret e2e-mock-tls-certs (optional). + - name: mock-tls-certs + mountPath: /certs + readOnly: true volumes: - name: app-root - emptyDir: {} + persistentVolumeClaim: + claimName: llama-stack-app-root - name: config-cm configMap: name: llama-stack-config @@ -217,3 +244,7 @@ spec: secret: secretName: e2e-interception-proxy-ca optional: true + - name: mock-tls-certs + secret: + secretName: e2e-mock-tls-certs + optional: true diff --git a/tests/e2e-prow/rhoai/pipeline-konflux.sh b/tests/e2e-prow/rhoai/pipeline-konflux.sh index dbd88fc43..22de47ba1 100755 --- a/tests/e2e-prow/rhoai/pipeline-konflux.sh +++ b/tests/e2e-prow/rhoai/pipeline-konflux.sh @@ -143,6 +143,25 @@ log "✅ Mock servers deployed" #======================================== progress "Deploying lightspeed-stack and llama-stack" +# PVC for llama-stack app-root: caches dnf/uv/git install so TLS per-scenario pod +# recreates skip the expensive init (~6-15 min → ~1-2 min). Delete first to guarantee +# a fresh checkout for this pipeline revision; re-create immediately so the pod can bind. +log "Recreating llama-stack-app-root PVC (fresh per pipeline run)..." +oc delete pvc llama-stack-app-root -n "$NAMESPACE" --ignore-not-found=true 2>/dev/null || true +cat <<'EOF' | oc apply -n "$NAMESPACE" -f - +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: llama-stack-app-root +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi +EOF +log "✅ llama-stack-app-root PVC created" + # Llama run config: single source with GitHub E2E (tests/e2e/configs/run-ci.yaml). 
# Lightspeed stack: same tree as local/docker E2E (tests/e2e/configuration/server-mode). oc create configmap llama-stack-config -n "$NAMESPACE" \ diff --git a/tests/e2e-prow/rhoai/pipeline-services-konflux.sh b/tests/e2e-prow/rhoai/pipeline-services-konflux.sh index 68eb61050..270d2bffd 100755 --- a/tests/e2e-prow/rhoai/pipeline-services-konflux.sh +++ b/tests/e2e-prow/rhoai/pipeline-services-konflux.sh @@ -26,9 +26,25 @@ oc create secret generic llama-stack-ip-secret \ -n "$NAMESPACE" \ --dry-run=client -o yaml | oc apply -f - +# PVC must exist before the pod (pipeline-konflux.sh creates it; guard here for standalone use). +oc get pvc llama-stack-app-root -n "$NAMESPACE" >/dev/null 2>&1 || \ + oc apply -n "$NAMESPACE" -f - <<'PVCEOF' +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: llama-stack-app-root +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi +PVCEOF + timeout 120 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true oc apply -n "$NAMESPACE" -f "$BASE_DIR/manifests/lightspeed/llama-stack-openai.yaml" -oc wait pod/llama-stack-service -n "$NAMESPACE" --for=condition=Ready --timeout=600s +# First boot runs the full init (dnf + git clone + uv sync ≈ 6-15 min); use a generous timeout. +oc wait pod/llama-stack-service -n "$NAMESPACE" --for=condition=Ready --timeout=900s oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" oc expose pod llama-stack-service --name=llama-stack-service-svc --port=8321 --type=ClusterIP -n "$NAMESPACE" diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 7f7b3d9a4..a501490a6 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -13,10 +13,14 @@ # pod is recreated, that forward must be restarted or you get "PodSandbox ... not found" / # APIConnectionError on subsequent scenarios. 
# - E2E_LLAMA_PORT_FORWARD_PID_FILE coordinates killing/restarting the 8321 forward. +# - restart-lightspeed ensures Llama is running before LCS recreate when needed. +# - restart-both-services is available explicitly; restart-lightspeed / restart-llama-stack +# do not auto-trigger a full stack restart on failure. # # Commands: # restart-lightspeed - Restart lightspeed-stack pod and port-forward # restart-llama-stack - Restart/restore llama-stack pod and localhost:8321 forward +# restart-both-services - Full llama-stack then lightspeed-stack restart (explicit only) # restart-port-forward - Re-establish port-forward for lightspeed # restart-llama-port-forward - Re-establish port-forward for Llama Stack (8321) # wait-for-pod [attempts] - Wait for a pod to be ready @@ -25,6 +29,10 @@ # disrupt-llama-stack - Delete llama-stack pod to disrupt connection # deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy (proxy.feature step) # deploy-e2e-interception-proxy - Deploy in-cluster interception proxy (proxy.feature step) +# deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls-*.feature) +# delete-e2e-mock-tls-inference - Remove mock TLS pod + Service (manual cleanup) +# restart-e2e-mock-tls-inference - Delete then deploy mock TLS (manual / recovery) +# sync-mock-tls-certs-secret - Copy mock /certs into Secret for llama-stack mount set -e @@ -40,21 +48,53 @@ E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks- # Helper functions # ============================================================================ +# Print container logs only (no events/describe). Used on failure paths. +e2e_ops_dump_pod_logs() { + local pod_name="${1:?pod name required}" + local log_tail="${2:-150}" + local prefix="[e2e-ops] " + local ctr tmp + + echo "${prefix}--- logs: pod/$pod_name ---" + if ! 
oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then + echo "${prefix}pod not found, cannot fetch logs" + return 0 + fi + while IFS= read -r ctr; do + [[ -n "$ctr" ]] || continue + echo "${prefix}--- oc logs -c $ctr ---" + tmp=$(mktemp) + oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --tail="$log_tail" >"$tmp" 2>&1 \ + || true + sed "s/^/${prefix}/" <"$tmp" + rm -f "$tmp" + done < <( + oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{range .spec.initContainers[*]}{.name}{"\n"}{end}{range .spec.containers[*]}{.name}{"\n"}{end}' 2>/dev/null \ + || true + ) +} + wait_for_pod() { local pod_name="$1" local max_attempts="${2:-24}" - + local attempt phase + for ((attempt=1; attempt<=max_attempts; attempt++)); do local ready ready=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false") if [[ "$ready" == "true" ]]; then - echo "✓ Pod $pod_name ready" + echo "✓ Pod $pod_name ready (after $((attempt * 3))s)" return 0 fi + if (( max_attempts >= 60 && attempt % 20 == 0 )); then + phase=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "?") + echo "[e2e-ops] Still waiting for $pod_name ($attempt/${max_attempts}, phase=${phase})..." 
+ fi sleep 3 done - + echo "Pod $pod_name not ready after $((max_attempts * 3))s" + e2e_ops_dump_pod_logs "$pod_name" 200 return 1 } @@ -184,8 +224,10 @@ e2e_ops_diagnose_forward_failure() { echo "[e2e-ops] /tmp/port-forward.log (tail 30):" tail -30 /tmp/port-forward.log 2>/dev/null | sed 's/^/[e2e-ops] /' || true fi - echo "[e2e-ops] oc get pods -n $NAMESPACE:" - oc get pods -n "$NAMESPACE" -o wide 2>&1 | sed 's/^/[e2e-ops] /' || true + e2e_ops_dump_pod_logs "lightspeed-stack-service" 10000 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + e2e_ops_dump_pod_logs "llama-stack-service" 10000 + fi } verify_connectivity() { @@ -272,9 +314,7 @@ wait_for_llama_stack_http_health() { fi done echo "ERROR: Llama Stack did not respond on http://127.0.0.1:8321/v1/health inside the pod" - oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true - oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true - oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=120 2>&1 || true + e2e_ops_dump_pod_logs "llama-stack-service" 200 return 1 } @@ -282,58 +322,9 @@ wait_for_llama_stack_http_health() { # Command implementations # ============================================================================ -cmd_restart_lightspeed() { - echo "Restarting lightspeed-stack service..." - - # LCS hangs at startup if Llama Stack is unreachable (blocks Llama handshake, - # never opens port 8080, readiness probe never passes). Ensure Llama Stack - # is healthy before recreating the LCS pod. - if ! _llama_stack_http_health_once 2>/dev/null; then - echo "⚠️ Llama Stack not healthy — restoring before LCS restart..." 
- cmd_restart_llama_stack || echo "⚠️ Llama Stack restore failed; LCS may be slow to start" - fi - - # Delete existing pod (short wait so hook stays within timeout; force if needed) - timeout 20 oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { - oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --force --grace-period=0 2>/dev/null || true - sleep 2 - } - - # Apply manifest (expand LIGHTSPEED_STACK_IMAGE only; filter prevents blanking other $VAR refs) - LIGHTSPEED_STACK_IMAGE="${LIGHTSPEED_STACK_IMAGE:-quay.io/lightspeed-core/lightspeed-stack:dev-latest}" - export LIGHTSPEED_STACK_IMAGE - _ls_manifest="$MANIFEST_DIR/lightspeed-stack.yaml" - sed "s|\${LIGHTSPEED_STACK_IMAGE}|${LIGHTSPEED_STACK_IMAGE}|g" "$_ls_manifest" | - oc apply -n "$NAMESPACE" -f - - - # Wait for pod to be ready (TCP probe passes when app listens on 8080). - # Don't let a timeout here abort the function — still attempt port-forward - # and diagnostics so later scenarios have a chance to recover. - local pod_ready=true - if ! wait_for_pod "lightspeed-stack-service" 40; then - pod_ready=false - echo "⚠️ Pod not ready within 120s — dumping diagnostics:" - oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -30 || true - oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=40 2>&1 || true - fi - - # Re-label pod for service discovery - oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite - - # Re-establish port-forwards (may succeed even if readiness was slow) - cmd_restart_port_forward - cmd_restart_jwks_port_forward || echo "⚠️ Mock JWKS port-forward failed (RBAC tests may fail)" - - if [[ "$pod_ready" == "false" ]]; then - echo "⚠️ Lightspeed restart completed but pod was slow to become ready" - return 1 - fi - echo "✓ Lightspeed restart complete" -} - -cmd_restart_llama_stack() { +# Llama pod recreate + in-pod health + :8321 port-forward. 
+_restart_llama_stack_core() { echo "===== Restoring llama-stack service =====" - # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env. echo "Deleting llama-stack pod (if any) before apply..." timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --force --grace-period=0 2>/dev/null || true @@ -342,12 +333,19 @@ cmd_restart_llama_stack() { echo "Applying pod manifest..." if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then - # Interception-proxy e2e: refresh Secret before pod recreate so the volume mount is populated. if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then echo "[e2e-ops] Syncing e2e-interception-proxy-ca secret before llama-stack apply..." if ! cmd_sync_interception_proxy_ca_secret; then echo "===== Llama-stack restore FAILED (interception CA secret sync) =====" - exit 1 + return 1 + fi + fi + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]] \ + && [[ "${E2E_SYNC_MOCK_TLS_CERTS:-0}" == "1" ]]; then + echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..." + if ! cmd_sync_mock_tls_certs_secret; then + echo "===== Llama-stack restore FAILED (mock TLS cert secret sync) =====" + return 1 fi fi _LLAMA_SVC_FQDN="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local" @@ -356,35 +354,116 @@ cmd_restart_llama_stack() { -n "$NAMESPACE" \ --dry-run=client -o yaml | oc apply -f - oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml" - wait_for_pod "llama-stack-service" 90 + # Konflux: setup-from-source init (dnf, git, uv) often needs 6–15 min before the main container is Ready. + local llama_wait_attempts=90 + if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then + llama_wait_attempts=360 + fi + if ! 
wait_for_pod "llama-stack-service" "$llama_wait_attempts"; then + echo "===== Llama-stack restore FAILED (pod not ready) =====" + return 1 + fi echo "Labeling pod for service..." oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then if ! _verify_interception_ca_mounted_in_llama; then echo "===== Llama-stack restore FAILED (interception CA not mounted) =====" - exit 1 + return 1 fi fi if ! wait_for_llama_stack_http_health 50; then echo "===== Llama-stack restore FAILED (HTTP not healthy) =====" - exit 1 + return 1 fi else - # Prow: vLLM Llama Stack image (matches pipeline.sh / pipeline-services.sh) - # Use sed instead of envsubst to avoid blanking $VAR references in embedded bash scripts sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack-prow.yaml" | oc apply -n "$NAMESPACE" -f - - wait_for_pod "llama-stack-service" 24 + if ! wait_for_pod "llama-stack-service" 24; then + echo "===== Llama-stack restore FAILED (pod not ready) =====" + return 1 + fi echo "Labeling pod for service..." oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite fi if ! cmd_restart_llama_port_forward; then echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed" - exit 1 + e2e_ops_dump_pod_logs "llama-stack-service" 200 + return 1 fi echo "===== Llama-stack restore complete =====" + return 0 +} + +# LCS pod recreate + :8080 port-forward; restores Llama Stack first if its health check fails (no full-stack retry on failure). +_restart_lightspeed_core() { + echo "Restarting lightspeed-stack service..." + + if ! _llama_stack_http_health_once 2>/dev/null; then + echo "⚠️ Llama Stack not healthy — restoring before LCS restart..." + if ! 
_restart_llama_stack_core; then + echo "===== Lightspeed restore FAILED (Llama not healthy) =====" + return 1 + fi + fi + + timeout 20 oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { + oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --force --grace-period=0 2>/dev/null || true + sleep 2 + } + + LIGHTSPEED_STACK_IMAGE="${LIGHTSPEED_STACK_IMAGE:-quay.io/lightspeed-core/lightspeed-stack:dev-latest}" + export LIGHTSPEED_STACK_IMAGE + _ls_manifest="$MANIFEST_DIR/lightspeed-stack.yaml" + sed "s|\${LIGHTSPEED_STACK_IMAGE}|${LIGHTSPEED_STACK_IMAGE}|g" "$_ls_manifest" | + oc apply -n "$NAMESPACE" -f - + + local pod_ready=true + if ! wait_for_pod "lightspeed-stack-service" 40; then + pod_ready=false + echo "⚠️ Pod not ready within 120s" + fi + + oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite + + if ! cmd_restart_port_forward; then + echo "⚠️ Lightspeed port-forward failed" + return 1 + fi + cmd_restart_jwks_port_forward || echo "⚠️ Mock JWKS port-forward failed (RBAC tests may fail)" + + if [[ "$pod_ready" == "false" ]]; then + echo "⚠️ Lightspeed restart completed but pod was slow to become ready" + return 1 + fi + echo "✓ Lightspeed restart complete" + return 0 +} + +# Full stack reset: Llama first (LCS depends on it), then LCS + port-forwards. +cmd_restart_both_services() { + echo "===== Full restart: llama-stack then lightspeed-stack =====" + local rc=0 + if ! _restart_llama_stack_core; then + rc=1 + elif ! 
_restart_lightspeed_core; then + rc=1 + fi + if [[ $rc -eq 0 ]]; then + echo "===== Full both-services restart complete =====" + else + echo "===== Full both-services restart FAILED =====" + fi + return $rc +} + +cmd_restart_llama_stack() { + _restart_llama_stack_core +} + +cmd_restart_lightspeed() { + _restart_lightspeed_core } cmd_restart_port_forward() { @@ -525,8 +604,10 @@ cmd_restart_llama_port_forward() { echo "Failed to establish Llama Stack port-forward on :$local_port" if [[ -s "$llama_pf_log" ]]; then + echo "[e2e-ops] $llama_pf_log (tail 30):" tail -30 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true fi + e2e_ops_dump_pod_logs "llama-stack-service" 200 return 1 } @@ -739,12 +820,153 @@ cmd_deploy_e2e_interception_proxy() { oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-interception-proxy.yaml" if ! oc wait pod/e2e-interception-proxy -n "$NAMESPACE" --for=condition=Ready --timeout=180s; then echo "ERROR: e2e-interception-proxy failed to become ready" >&2 - oc describe pod e2e-interception-proxy -n "$NAMESPACE" 2>/dev/null | tail -25 || true + e2e_ops_dump_pod_logs "e2e-interception-proxy" 80 return 1 fi echo "✓ e2e-interception-proxy ready at http://e2e-interception-proxy.${NAMESPACE}.svc.cluster.local:8889" } +_MOCK_TLS_CERT_FILES=( + ca.crt client.crt client.key untrusted-ca.crt expired-ca.crt + untrusted-client.crt untrusted-client.key expired-client.crt +) + +# Copy one PEM from the mock pod; retries oc exec when kubelet returns transient EOF. 
+_mock_tls_copy_cert_from_pod() { + local mock_pod="$1" + local cert_name="$2" + local dest="$3" + local max_attempts="${4:-6}" + local attempt + + for ((attempt=1; attempt<=max_attempts; attempt++)); do + if oc exec -n "$NAMESPACE" "$mock_pod" -c e2e-mock-tls-inference --request-timeout=60s -- \ + cat "/certs/${cert_name}" >"$dest" 2>/dev/null && [[ -s "$dest" ]]; then + return 0 + fi + rm -f "$dest" + if [[ $attempt -lt $max_attempts ]]; then + echo "[e2e-ops] WARN: oc exec /certs/${cert_name} failed (attempt $attempt/$max_attempts); retrying..." + sleep $((attempt * 2)) + fi + done + return 1 +} + +# Read all mock /certs into a temp dir (caller removes tmpdir). +_sync_mock_tls_certs_from_running_pod() { + local mock_pod="e2e-mock-tls-inference" + local f tmpdir from_args + + tmpdir=$(mktemp -d) + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + if ! _mock_tls_copy_cert_from_pod "$mock_pod" "$f" "${tmpdir}/${f}"; then + echo "ERROR: failed to read /certs/${f} from $mock_pod after retries" >&2 + rm -rf "$tmpdir" + return 1 + fi + done + from_args=() + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + from_args+=(--from-file="${f}=${tmpdir}/${f}") + done + oc create secret generic e2e-mock-tls-certs \ + "${from_args[@]}" -n "$NAMESPACE" \ + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - + rm -rf "$tmpdir" + return 0 +} + +# Apply mock TLS manifest and wait for Ready (no Secret sync). +_deploy_e2e_mock_tls_inference_pod() { + local repo_root server_py + repo_root="$(_e2e_repo_root)" + server_py="$repo_root/tests/e2e/mock_tls_inference_server/server.py" + if [[ ! -f "$server_py" ]]; then + echo "ERROR: missing $server_py" >&2 + return 1 + fi + echo "Deploying e2e-mock-tls-inference pod in namespace $NAMESPACE..." + oc create configmap e2e-mock-tls-inference-script -n "$NAMESPACE" \ + --from-file=server.py="$server_py" \ + --dry-run=client -o yaml | oc apply -f - + oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml" + if ! 
oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then + echo "ERROR: e2e-mock-tls-inference failed to become ready" >&2 + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" 80 + return 1 + fi + return 0 +} + +cmd_sync_mock_tls_certs_secret() { + local mock_pod="e2e-mock-tls-inference" + local pass + + for pass in 1 2; do + if ! oc get pod "$mock_pod" -n "$NAMESPACE" &>/dev/null; then + echo "[e2e-ops] WARN: $mock_pod not found — deploying mock TLS inference..." + if ! _deploy_e2e_mock_tls_inference_pod; then + return 1 + fi + elif ! oc wait pod/"$mock_pod" -n "$NAMESPACE" --for=condition=Ready --timeout=120s; then + echo "[e2e-ops] WARN: $mock_pod not Ready — redeploying mock TLS inference..." + cmd_delete_e2e_mock_tls_inference + if ! _deploy_e2e_mock_tls_inference_pod; then + return 1 + fi + fi + + if _sync_mock_tls_certs_from_running_pod; then + echo "✓ Secret e2e-mock-tls-certs updated" + return 0 + fi + + if [[ $pass -eq 1 ]]; then + echo "[e2e-ops] WARN: mock TLS cert sync failed (oc exec/kubelet) — redeploying mock pod and retrying..." + cmd_delete_e2e_mock_tls_inference + if ! _deploy_e2e_mock_tls_inference_pod; then + return 1 + fi + continue + fi + + e2e_ops_dump_pod_logs "$mock_pod" 80 + return 1 + done + return 1 +} + +cmd_delete_e2e_mock_tls_inference() { + echo "Removing e2e-mock-tls-inference from namespace $NAMESPACE..." + timeout 60 oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { + oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --force --grace-period=0 2>/dev/null || true + sleep 2 + } + oc delete svc e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true 2>/dev/null || true + echo "✓ e2e-mock-tls-inference removed" +} + +cmd_deploy_e2e_mock_tls_inference() { + if ! _deploy_e2e_mock_tls_inference_pod; then + return 1 + fi + if ! 
cmd_sync_mock_tls_certs_secret; then + return 1 + fi + echo "✓ e2e-mock-tls-inference ready" +} + +cmd_restart_e2e_mock_tls_inference() { + echo "Restarting e2e-mock-tls-inference (delete + deploy)..." + cmd_delete_e2e_mock_tls_inference + cmd_deploy_e2e_mock_tls_inference +} + +cmd_dump_pod_logs() { + e2e_ops_dump_pod_logs "${1:?pod name required}" "${2:-150}" +} + cmd_disrupt_llama_stack() { local pod_name="llama-stack-service" @@ -776,6 +998,9 @@ case "$COMMAND" in restart-llama-stack) cmd_restart_llama_stack ;; + restart-both-services) + cmd_restart_both_services + ;; restart-llama-port-forward) cmd_restart_llama_port_forward ;; @@ -815,12 +1040,28 @@ case "$COMMAND" in deploy-e2e-interception-proxy) cmd_deploy_e2e_interception_proxy ;; + deploy-e2e-mock-tls-inference) + cmd_deploy_e2e_mock_tls_inference + ;; + delete-e2e-mock-tls-inference) + cmd_delete_e2e_mock_tls_inference + ;; + restart-e2e-mock-tls-inference) + cmd_restart_e2e_mock_tls_inference + ;; + sync-mock-tls-certs-secret) + cmd_sync_mock_tls_certs_secret + ;; + dump-pod-logs) + cmd_dump_pod_logs "$@" + ;; *) echo "Usage: $0 [args...]" echo "" echo "Commands:" echo " restart-lightspeed - Restart lightspeed-stack pod and port-forward" echo " restart-llama-stack - Restart/restore llama-stack pod" + echo " restart-both-services - Full llama-stack + lightspeed-stack restart (explicit)" echo " restart-llama-port-forward - Re-establish port-forward for Llama (8321)" echo " restart-port-forward - Re-establish port-forward for lightspeed" echo " wait-for-pod [attempts] - Wait for a pod to be ready" @@ -833,6 +1074,11 @@ case "$COMMAND" in echo " sync-interception-proxy-ca-secret - Publish trustme CA to Secret for llama mount" echo " deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy pod" echo " deploy-e2e-interception-proxy - Deploy in-cluster interception proxy pod" + echo " deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference (tls-*.feature)" + echo " delete-e2e-mock-tls-inference - Remove 
mock TLS pod + Service" + echo " restart-e2e-mock-tls-inference - Delete then deploy mock TLS (recovery)" + echo " sync-mock-tls-certs-secret - Publish mock TLS /certs to Secret" + echo " dump-pod-logs [tail-lines] - Print init + container logs" exit 1 ;; esac diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml index babdc2b99..fd45ea744 100644 --- a/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml +++ b/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml @@ -8,7 +8,7 @@ service: access_log: true llama_stack: use_as_library_client: false - url: http://llama-stack:8321 + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 api_key: xyzzy user_data_collection: feedback_enabled: true diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index fdca1247c..e44cc23c3 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -26,6 +26,11 @@ reset_llama_stack_disrupt_once_tracking, reset_llama_stack_was_running, ) +from tests.e2e.features.steps.tls import ( + is_tls_feature_file, + prepare_tls_feature_entry_on_prow, + reset_tls_prow_state, +) from tests.e2e.utils.llama_stack_utils import register_shield from tests.e2e.utils.prow_utils import ( restart_pod, @@ -237,24 +242,26 @@ def before_scenario(context: Context, scenario: Scenario) -> None: delattr(context, _attr) -def _dump_pod_logs_on_failure(scenario: Scenario, namespace: str) -> None: - """Dump llama-stack and lightspeed-stack pod logs when a scenario fails in Prow.""" +def _dump_pod_logs_on_failure( + context: Context, scenario: Scenario, namespace: str +) -> None: + """Dump container logs when a scenario fails in Prow.""" if scenario.status != "failed": return - for pod in ("llama-stack-service", "lightspeed-stack-service"): - print(f"--- {pod} logs (scenario failed: {scenario.name}) ---") + pods: tuple[str, ...] 
= ("llama-stack-service", "lightspeed-stack-service") + feature = getattr(context, "feature", None) + feat_file = getattr(feature, "filename", "") or "" if feature else "" + if is_tls_feature_file(feat_file): + pods = (*pods, "e2e-mock-tls-inference") + print(f"--- scenario failed: {scenario.name!r} — pod logs ---", flush=True) + for pod in pods: try: - r = subprocess.run( - ["oc", "logs", pod, "-n", namespace, "--tail=100"], - capture_output=True, - text=True, - timeout=15, - check=False, - ) - print(r.stdout or r.stderr or "(no output)") + result = run_e2e_ops("dump-pod-logs", [pod, "150"], timeout=90) + print(result.stdout, end="") + if result.stderr: + print(result.stderr, end="") except subprocess.TimeoutExpired: - print("(timed out fetching logs)") - print(f"--- end {pod} logs ---") + print(f"(timed out fetching logs for {pod})") def after_scenario(context: Context, scenario: Scenario) -> None: @@ -288,7 +295,7 @@ def after_scenario(context: Context, scenario: Scenario) -> None: """ if is_prow_environment(): _dump_pod_logs_on_failure( - scenario, os.environ.get("NAMESPACE", "e2e-rhoai-dsc") + context, scenario, os.environ.get("NAMESPACE", "e2e-rhoai-dsc") ) if getattr(context, "scenario_lightspeed_override_active", False): @@ -451,6 +458,9 @@ def before_feature(context: Context, feature: Feature) -> None: context.active_lightspeed_stack_config_basename = None # One real Llama disruption per feature (module-level flag; survives context resets) reset_llama_stack_disrupt_once_tracking() + if feature.filename and is_tls_feature_file(feature.filename): + reset_tls_prow_state() + prepare_tls_feature_entry_on_prow(feature.filename) try: max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS)) @@ -520,5 +530,6 @@ def after_feature(context: Context, feature: Feature) -> None: # Behave captures hook stdout by default; output is only shown in some failure paths. -# Disable capture so feature timing lines always appear on the real console/CI log. 
+# Disable capture so feature timing and failure diagnostics appear in the main CI log. after_feature.capture = False # type: ignore[attr-defined] +after_scenario.capture = False # type: ignore[attr-defined] diff --git a/tests/e2e/features/proxy.feature b/tests/e2e/features/proxy.feature index 907c4317d..00fde258a 100644 --- a/tests/e2e/features/proxy.feature +++ b/tests/e2e/features/proxy.feature @@ -1,4 +1,4 @@ -@e2e_group_3 @skip-in-library-mode +@e2e_group_3 @skip-in-library-mode @skip-in-prow Feature: Proxy and TLS networking tests for Llama Stack providers Verify that the Lightspeed Stack works correctly when Llama Stack's diff --git a/tests/e2e/features/steps/proxy.py b/tests/e2e/features/steps/proxy.py index 3fb29b270..7755cca91 100644 --- a/tests/e2e/features/steps/proxy.py +++ b/tests/e2e/features/steps/proxy.py @@ -305,6 +305,7 @@ def restore_if_modified(context: Context) -> None: _stop_proxy(context, "tunnel_proxy", "proxy_loop") _stop_proxy(context, "interception_proxy", "interception_proxy_loop") os.environ.pop("E2E_COPY_INTERCEPTION_CA_TO_LLAMA", None) + os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) if hasattr(context, "needs_interception_ca_on_llama"): delattr(context, "needs_interception_ca_on_llama") @@ -318,6 +319,14 @@ def restore_if_modified(context: Context) -> None: @given("Llama Stack is restarted") def restart_llama_stack(context: Context) -> None: """Restart the Llama Stack container.""" + from tests.e2e.features.steps.tls import ( + is_tls_configuration_feature, + restart_llama_for_tls_feature, + ) + + if is_tls_configuration_feature(context): + restart_llama_for_tls_feature(context) + return restart_container("llama-stack") diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 66d56adcc..b0ea19edf 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -9,6 +9,7 @@ """ import copy +import os from typing import Any, Optional from behave import given # pyright: 
ignore[reportAttributeAccessIssue] @@ -16,19 +17,17 @@ from tests.e2e.utils.llama_config_utils import ( backup_llama_config, + clear_llama_config_backup, load_llama_config, + reset_llama_run_config_to_pipeline_default, write_llama_config, ) +from tests.e2e.utils.prow_utils import get_namespace, restart_pod, run_e2e_ops +from tests.e2e.utils.utils import is_prow_environment -_TLS_PROVIDER_BASE: dict[str, Any] = { - "provider_id": "tls-openai", - "provider_type": "remote::openai", - "config": { - "api_key": "test-key", - "base_url": "https://mock-tls-inference:8443/v1", - "allowed_models": ["mock-tls-model"], - }, -} +_MOCK_TLS_PORT_TLS = 8443 +_MOCK_TLS_PORT_MTLS = 8444 +_MOCK_TLS_PORT_HOSTNAME_MISMATCH = 8445 _TLS_MODEL_RESOURCE: dict[str, str] = { "model_id": "mock-tls-model", @@ -37,6 +36,167 @@ } +def reset_tls_prow_state() -> None: + """Reset per-feature TLS test state (call from ``before_feature``).""" + os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) + os.environ.pop("E2E_SYNC_MOCK_TLS_CERTS", None) + os.environ.pop("E2E_MOCK_TLS_INFERENCE_HOST", None) + clear_llama_config_backup() + + +def is_tls_feature_file(feature_filename: str | None) -> bool: + """Return True for split TLS feature files (``tls-ca.feature``, etc.).""" + if not feature_filename: + return False + basename = os.path.basename(feature_filename) + return basename.startswith("tls-") and basename.endswith(".feature") + + +def prepare_tls_feature_entry_on_prow(feature_filename: str | None = None) -> None: + """Baseline cluster state at the start of each tls-*.feature file. + + Mock TLS stays up for the whole tls suite (tls-ca → tls-mtls → tls-tlsv13). + Certs are synced to the Secret only when the mock pod is first deployed + (``deploy-e2e-mock-tls-inference``). Per-scenario TLS cases change which + ``/certs/*`` path Llama uses via run.yaml, not the Secret contents. 
+ """ + if not is_prow_environment(): + return + label = os.path.basename(feature_filename or "tls.feature") + print(f"[{label}] Prow/Konflux entry: ensure mock TLS, reset run.yaml, warm Llama...") + reset_llama_run_config_to_pipeline_default() + _ensure_cluster_mock_tls_inference() + _prepare_tls_prow_llama_restart_env() + os.environ.pop("E2E_SYNC_MOCK_TLS_CERTS", None) + restart_pod("llama-stack") + print(f"[{label}] Prow/Konflux entry baseline complete", flush=True) + + +def is_tls_configuration_feature(context: Context) -> bool: + """Return True when the active Behave feature is a tls-*.feature file.""" + feature = getattr(context, "feature", None) + if feature is None: + return False + filename = getattr(feature, "filename", None) + if is_tls_feature_file(filename): + return True + name = getattr(feature, "name", "") or "" + return "TLS configuration" in name + + +def _prepare_tls_prow_llama_restart_env() -> None: + """Set env for full llama pod recreate with mock TLS certs mounted.""" + os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1" + + +def _restart_lightspeed_after_llama_tls(context: Context) -> None: + """Restart LCS after Llama recreate so the in-process Llama client reconnects. + + TLS scenarios only change Llama run.yaml; LCS yaml is unchanged. Without this, + queries through LCS often fail with 503/connection errors after Llama pod + recreate on Prow (stale HTTP connections). + """ + from tests.e2e.utils.utils import ( + restart_container, + wait_for_lightspeed_stack_http_ready, + ) + + scenario = getattr(getattr(context, "scenario", None), "name", "") or "?" 
+ feature_file = os.path.basename( + getattr(getattr(context, "feature", None), "filename", "") or "tls.feature" + ) + print( + f"[{feature_file}] Lightspeed Stack refresh after Llama recreate " + f"scenario={scenario!r}", + flush=True, + ) + restart_container("lightspeed-stack") + wait_for_lightspeed_stack_http_ready() + + +def restart_llama_for_tls_feature(context: Context) -> None: + """Restart Llama for TLS tests, then refresh LCS (full pod recreate on Prow/Konflux).""" + from tests.e2e.utils.utils import restart_container + + if is_prow_environment(): + _prepare_tls_prow_llama_restart_env() + os.environ.pop("E2E_SYNC_MOCK_TLS_CERTS", None) + scenario = getattr(getattr(context, "scenario", None), "name", "") or "?" + feature_file = os.path.basename( + getattr(getattr(context, "feature", None), "filename", "") or "tls.feature" + ) + print( + f"[{feature_file}] Llama Stack restart: full recreate scenario={scenario!r}", + flush=True, + ) + restart_container("llama-stack") + _restart_lightspeed_after_llama_tls(context) + + +def _cluster_mock_tls_inference_host() -> str: + """DNS name of the in-cluster mock TLS inference server (Konflux / Prow).""" + explicit = os.getenv("E2E_MOCK_TLS_INFERENCE_HOST", "").strip() + if explicit: + return explicit + return f"e2e-mock-tls-inference.{get_namespace()}.svc.cluster.local" + + +def _mock_tls_base_url(port: int) -> str: + """OpenAI-compatible base URL for the mock TLS inference server.""" + if is_prow_environment(): + host = _cluster_mock_tls_inference_host() + else: + host = "mock-tls-inference" + return f"https://{host}:{port}/v1" + + +def _tls_provider_base() -> dict[str, Any]: + """Default tls-openai provider dict with environment-appropriate base_url.""" + return { + "provider_id": "tls-openai", + "provider_type": "remote::openai", + "config": { + "api_key": "test-key", + "base_url": _mock_tls_base_url(_MOCK_TLS_PORT_TLS), + "allowed_models": ["mock-tls-model"], + "refresh_models": False, + }, + } + + +def 
_mock_tls_inference_pod_ready() -> bool: + """Return True when the in-cluster mock TLS pod exists and is Ready.""" + result = run_e2e_ops("wait-for-pod", ["e2e-mock-tls-inference", "1"], timeout=45) + return result.returncode == 0 + + +def _ensure_cluster_mock_tls_inference() -> None: + """Deploy mock TLS on Prow if missing; keep one pod for the whole tls suite. + + ``deploy-e2e-mock-tls-inference`` copies all PEMs into Secret ``e2e-mock-tls-certs`` + once. Scenarios only change which cert path Llama uses in run.yaml. + """ + if _mock_tls_inference_pod_ready(): + print("Using existing e2e-mock-tls-inference deployment") + os.environ.setdefault( + "E2E_MOCK_TLS_INFERENCE_HOST", + _cluster_mock_tls_inference_host(), + ) + return + + result = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300) + print(result.stdout, end="") + if result.returncode != 0: + raise RuntimeError( + "Failed to deploy e2e-mock-tls-inference: " + f"{result.stderr or result.stdout}" + ) + os.environ.setdefault( + "E2E_MOCK_TLS_INFERENCE_HOST", + _cluster_mock_tls_inference_host(), + ) + + def _ensure_tls_provider(config: dict[str, Any]) -> dict[str, Any]: """Find or create the tls-openai inference provider in the config. 
@@ -59,7 +219,7 @@ def _ensure_tls_provider(config: dict[str, Any]) -> dict[str, Any]: return provider # Provider not found — add it - provider = copy.deepcopy(_TLS_PROVIDER_BASE) + provider = copy.deepcopy(_tls_provider_base()) inference.append(provider) # Also register the model resource @@ -85,13 +245,27 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) - provider.setdefault("config", {}).setdefault("network", {}) if base_url is not None: provider["config"]["base_url"] = base_url + else: + provider["config"]["base_url"] = _mock_tls_base_url(_MOCK_TLS_PORT_TLS) + provider.setdefault("config", {})["refresh_models"] = False provider["config"]["network"]["tls"] = tls_config write_llama_config(config) + if is_prow_environment(): + _prepare_tls_prow_llama_restart_env() # --- Background Steps --- # ``The original Llama Stack config is restored if modified`` only restores -# run.yaml (see proxy.py). Restart steps are listed in tls.feature / proxy.feature. +# run.yaml (see proxy.py). Restart steps are listed in tls-*.feature / proxy.feature. 
+ + +@given("The mock TLS inference server is deployed") +def deploy_mock_tls_inference_server(context: Context) -> None: + """Ensure mock TLS inference is reachable (Compose locally, pod in Prow).""" + if is_prow_environment(): + _ensure_cluster_mock_tls_inference() + return + print("Using docker-compose mock-tls-inference service") # --- TLS Configuration Steps --- @@ -121,30 +295,25 @@ def configure_tls_mtls(context: Context) -> None: _configure_tls( { "verify": "/certs/ca.crt", + "min_version": "TLSv1.2", "client_cert": "/certs/client.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) -@given('Llama Stack is configured with CA certificate path "{path}"') -def configure_tls_verify_ca_path(context: Context, path: str) -> None: - """Configure run.yaml with TLS verify pointing to a specific CA cert path.""" - _configure_tls({"verify": path}) - - @given("Llama Stack is configured for mTLS without client certificate") -def configure_mtls_no_client_cert(context: Context) -> None: +def configure_tls_mtls_no_client_cert(context: Context) -> None: """Configure run.yaml for mTLS port without client cert (should fail).""" _configure_tls( {"verify": "/certs/ca.crt"}, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @given("Llama Stack is configured for mTLS with wrong client certificate") -def configure_mtls_wrong_client_cert(context: Context) -> None: +def configure_tls_mtls_wrong_client_cert(context: Context) -> None: """Configure run.yaml for mTLS with invalid client cert (CA cert as client cert).""" _configure_tls( { @@ -152,58 +321,75 @@ def configure_mtls_wrong_client_cert(context: Context) -> None: "client_cert": "/certs/ca.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @given("Llama Stack is configured for mTLS with 
untrusted client certificate") -def configure_mtls_untrusted_client_cert(context: Context) -> None: - """Configure run.yaml for mTLS with client cert from untrusted CA.""" +def configure_tls_mtls_untrusted_client_cert(context: Context) -> None: + """Configure run.yaml with untrusted client certificate.""" _configure_tls( { "verify": "/certs/ca.crt", + "min_version": "TLSv1.2", "client_cert": "/certs/untrusted-client.crt", "client_key": "/certs/untrusted-client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @given("Llama Stack is configured for mTLS with expired client certificate") -def configure_mtls_expired_client_cert(context: Context) -> None: - """Configure run.yaml for mTLS with an expired client certificate.""" +def configure_tls_mtls_expired_client_cert(context: Context) -> None: + """Configure run.yaml with expired client certificate.""" _configure_tls( { "verify": "/certs/ca.crt", + "min_version": "TLSv1.2", "client_cert": "/certs/expired-client.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @given("Llama Stack is configured with CA certificate and hostname mismatch server") -def configure_tls_hostname_mismatch(context: Context) -> None: +def configure_tls_ca_hostname_mismatch(context: Context) -> None: """Configure run.yaml to connect to hostname-mismatch server (should fail).""" _configure_tls( {"verify": "/certs/ca.crt"}, - base_url="https://mock-tls-inference:8445/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH), ) @given("Llama Stack is configured with mutual TLS and hostname mismatch server") -def configure_mtls_hostname_mismatch(context: Context) -> None: - """Configure run.yaml for mTLS against hostname-mismatch server (should fail).""" +def configure_tls_mtls_hostname_mismatch(context: Context) -> None: + """Configure run.yaml with mTLS against hostname-mismatch 
server.""" _configure_tls( { "verify": "/certs/ca.crt", + "min_version": "TLSv1.2", "client_cert": "/certs/client.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8445/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH), ) +@given('Llama Stack is configured with CA certificate path "{path}"') +def configure_tls_ca_path(context: Context, path: str) -> None: + """Configure run.yaml with TLS verify pointing to a specific CA cert path.""" + _configure_tls({"verify": path}) + + +@given( + 'Llama Stack is configured with TLS minimum version "{version}" and CA certificate path "{path}"' +) +def configure_tls_min_version_and_ca(context: Context, version: str, path: str) -> None: + """Configure run.yaml with TLS minimum version and a specific CA cert path.""" + _configure_tls({"verify": path, "min_version": version}) + + @given( 'Llama Stack is configured with TLS minimum version "{version}" and hostname mismatch server' ) @@ -211,15 +397,5 @@ def configure_tls_min_version_hostname_mismatch(context: Context, version: str) """Configure run.yaml with TLS min version against hostname-mismatch server.""" _configure_tls( {"verify": "/certs/ca.crt", "min_version": version}, - base_url="https://mock-tls-inference:8445/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH), ) - - -@given( - 'Llama Stack is configured with TLS minimum version "{version}" and CA certificate path "{path}"' -) -def configure_tls_min_version_with_ca_path( - context: Context, version: str, path: str -) -> None: - """Configure run.yaml with TLS minimum version and a specific CA cert path.""" - _configure_tls({"verify": path, "min_version": version}) diff --git a/tests/e2e/features/tls-ca.feature b/tests/e2e/features/tls-ca.feature new file mode 100644 index 000000000..016e3b964 --- /dev/null +++ b/tests/e2e/features/tls-ca.feature @@ -0,0 +1,75 @@ +@e2e_group_1 @skip-in-library-mode +Feature: TLS configuration — CA certificate verification + 
Validate Llama Stack NetworkConfig.tls CA trust settings against the mock HTTPS + inference provider (standard TLS port). + + # Only Llama run.yaml changes per scenario; LCS uses lightspeed-stack-tls.yaml + # throughout (Background). ``Llama Stack is restarted`` also refreshes LCS. + + Background: + Given The service is started locally + And The system is in default state + And REST API service prefix is /v1 + And the Lightspeed stack configuration directory is "tests/e2e/configuration" + And The original Llama Stack config is restored if modified + And The mock TLS inference server is deployed + And The service uses the lightspeed-stack-tls.yaml configuration + And The service is restarted + + Scenario: Inference succeeds with TLS verification disabled + Given Llama Stack is configured with TLS verification disabled + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 200 + + Scenario: Inference succeeds with CA certificate verification + Given Llama Stack is configured with CA certificate verification + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 200 + + Scenario: Inference fails with an untrusted CA certificate + Given Llama Stack is configured with CA certificate path "/certs/untrusted-ca.crt" + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server + + Scenario: Inference fails with an expired CA certificate + Given Llama Stack is configured with CA certificate path "/certs/expired-ca.crt" + And Llama Stack is restarted + When I use 
"query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server + + Scenario: Inference fails when TLS verify is true against self-signed cert + Given Llama Stack is configured with TLS verification enabled + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server + + Scenario: Inference fails with CA certificate verification and hostname mismatch + Given Llama Stack is configured with CA certificate and hostname mismatch server + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server diff --git a/tests/e2e/features/tls-mtls.feature b/tests/e2e/features/tls-mtls.feature new file mode 100644 index 000000000..ba74182db --- /dev/null +++ b/tests/e2e/features/tls-mtls.feature @@ -0,0 +1,76 @@ +@e2e_group_1 @skip-in-library-mode +Feature: TLS configuration — mutual TLS authentication + Validate Llama Stack NetworkConfig.tls client certificate settings against the + mock HTTPS inference provider (mTLS port). + + # Only Llama run.yaml changes per scenario; LCS uses lightspeed-stack-tls.yaml + # throughout (Background). ``Llama Stack is restarted`` also refreshes LCS. 
+ + Background: + Given The service is started locally + And The system is in default state + And REST API service prefix is /v1 + And the Lightspeed stack configuration directory is "tests/e2e/configuration" + And The original Llama Stack config is restored if modified + And The mock TLS inference server is deployed + And The service uses the lightspeed-stack-tls.yaml configuration + And The service is restarted + + Scenario: Inference succeeds with mutual TLS authentication + Given Llama Stack is configured with mutual TLS authentication + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 200 + + Scenario: Inference fails when mTLS is required but no client certificate is provided + Given Llama Stack is configured for mTLS without client certificate + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server + + Scenario: Inference fails when mTLS is required but wrong client certificate is provided + Given Llama Stack is configured for mTLS with wrong client certificate + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server + + Scenario: Inference fails when mTLS is required but untrusted client certificate is provided + Given Llama Stack is configured for mTLS with untrusted client certificate + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + 
Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server + + Scenario: Inference fails when mTLS is required but expired client certificate is provided + Given Llama Stack is configured for mTLS with expired client certificate + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server + + Scenario: Inference fails with mutual TLS and hostname mismatch + Given Llama Stack is configured with mutual TLS and hostname mismatch server + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server diff --git a/tests/e2e/features/tls-tlsv13.feature b/tests/e2e/features/tls-tlsv13.feature new file mode 100644 index 000000000..a9cb4d9a0 --- /dev/null +++ b/tests/e2e/features/tls-tlsv13.feature @@ -0,0 +1,56 @@ +@e2e_group_1 @skip-in-library-mode +Feature: TLS configuration — TLS minimum version 1.3 + Validate Llama Stack NetworkConfig.tls min_version TLSv1.3 against the mock + HTTPS inference provider. + + # Only Llama run.yaml changes per scenario; LCS uses lightspeed-stack-tls.yaml + # throughout (Background). ``Llama Stack is restarted`` also refreshes LCS. 
+ + Background: + Given The service is started locally + And The system is in default state + And REST API service prefix is /v1 + And the Lightspeed stack configuration directory is "tests/e2e/configuration" + And The original Llama Stack config is restored if modified + And The mock TLS inference server is deployed + And The service uses the lightspeed-stack-tls.yaml configuration + And The service is restarted + + Scenario: Inference succeeds with TLS minimum version TLSv1.3 + Given Llama Stack is configured with TLS minimum version "TLSv1.3" and CA certificate path "/certs/ca.crt" + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 200 + + Scenario: Inference fails with TLS minimum version TLSv1.3 and untrusted CA certificate + Given Llama Stack is configured with TLS minimum version "TLSv1.3" and CA certificate path "/certs/untrusted-ca.crt" + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server + + Scenario: Inference fails with TLS minimum version TLSv1.3 and hostname mismatch + Given Llama Stack is configured with TLS minimum version "TLSv1.3" and hostname mismatch server + And Llama Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server + + Scenario: Inference fails with TLS minimum version TLSv1.3 and expired CA certificate + Given Llama Stack is configured with TLS minimum version "TLSv1.3" and CA certificate path "/certs/expired-ca.crt" + And Llama 
Stack is restarted + When I use "query" to ask question + """ + {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} + """ + Then The status code of the response is 500 + And The body of the response does not contain Hello from the TLS mock inference server diff --git a/tests/e2e/features/tls.feature b/tests/e2e/features/tls.feature deleted file mode 100644 index a900b1c0f..000000000 --- a/tests/e2e/features/tls.feature +++ /dev/null @@ -1,185 +0,0 @@ -@e2e_group_1 @skip-in-library-mode @skip-in-prow -Feature: TLS configuration for remote inference providers - Validate that Llama Stack's NetworkConfig.tls settings are applied correctly - when connecting to a remote inference provider over HTTPS. - - Background: - Given The service is started locally - And The system is in default state - And REST API service prefix is /v1 - And the Lightspeed stack configuration directory is "tests/e2e/configuration" - And The service uses the lightspeed-stack-tls.yaml configuration - And The service is restarted - And The original Llama Stack config is restored if modified - - Scenario: Inference succeeds with TLS verification disabled - Given Llama Stack is configured with TLS verification disabled - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 200 - - Scenario: Inference succeeds with CA certificate verification - Given Llama Stack is configured with CA certificate verification - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 200 - - Scenario: Inference fails with an untrusted CA certificate - Given Llama Stack is configured with CA certificate path "/certs/untrusted-ca.crt" - And Llama 
Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server - - Scenario: Inference fails with an expired CA certificate - Given Llama Stack is configured with CA certificate path "/certs/expired-ca.crt" - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server - - Scenario: Inference fails when TLS verify is true against self-signed cert - Given Llama Stack is configured with TLS verification enabled - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server - - Scenario: Inference succeeds with mutual TLS authentication - Given Llama Stack is configured with mutual TLS authentication - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 200 - - Scenario: Inference fails when mTLS is required but no client certificate is provided - Given Llama Stack is configured for mTLS without client certificate - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The 
status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server - - Scenario: Inference fails when mTLS is required but wrong client certificate is provided - Given Llama Stack is configured for mTLS with wrong client certificate - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server - - Scenario: Inference fails when mTLS is required but untrusted client certificate is provided - Given Llama Stack is configured for mTLS with untrusted client certificate - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server - - Scenario: Inference fails when mTLS is required but expired client certificate is provided - Given Llama Stack is configured for mTLS with expired client certificate - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server - - Scenario: Inference fails with CA certificate verification and hostname mismatch - Given Llama Stack is configured with CA certificate and hostname mismatch server - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The 
status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server - - Scenario: Inference fails with mutual TLS and hostname mismatch - Given Llama Stack is configured with mutual TLS and hostname mismatch server - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server - - Scenario: Inference succeeds with TLS minimum version TLSv1.3 - Given Llama Stack is configured with TLS minimum version "TLSv1.3" and CA certificate path "/certs/ca.crt" - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 200 - - Scenario: Inference fails with TLS minimum version TLSv1.3 and untrusted CA certificate - Given Llama Stack is configured with TLS minimum version "TLSv1.3" and CA certificate path "/certs/untrusted-ca.crt" - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server - - Scenario: Inference fails with TLS minimum version TLSv1.3 and hostname mismatch - Given Llama Stack is configured with TLS minimum version "TLSv1.3" and hostname mismatch server - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 500 - And The body of the 
response does not contain Hello from the TLS mock inference server - - Scenario: Inference fails with TLS minimum version TLSv1.3 and expired CA certificate - Given Llama Stack is configured with TLS minimum version "TLSv1.3" and CA certificate path "/certs/expired-ca.crt" - And Llama Stack is restarted - And Lightspeed Stack is restarted - When I use "query" to ask question - """ - {"query": "Say hello", "model": "mock-tls-model", "provider": "tls-openai"} - """ - Then The status code of the response is 500 - And The body of the response does not contain Hello from the TLS mock inference server diff --git a/tests/e2e/mock_tls_inference_server/server.py b/tests/e2e/mock_tls_inference_server/server.py index bfb4cbae5..25bd23a0c 100644 --- a/tests/e2e/mock_tls_inference_server/server.py +++ b/tests/e2e/mock_tls_inference_server/server.py @@ -13,6 +13,7 @@ import datetime import json +import os import ssl import threading import time @@ -29,6 +30,25 @@ MTLS_PORT = 8444 HOSTNAME_MISMATCH_PORT = 8445 +_DEFAULT_SERVER_CERT_DNS_NAMES: tuple[str, ...] = ( + "mock-tls-inference", + "localhost", + "127.0.0.1", +) + + +def _server_cert_dns_names() -> tuple[str, ...]: + """Return DNS identities for the main server certificate. + + Reads comma-separated ``TLS_CERT_DNS_NAMES`` (set in Konflux/Prow manifest). + Falls back to Docker Compose defaults when unset. 
+ """ + raw = os.environ.get("TLS_CERT_DNS_NAMES", "").strip() + if not raw: + return _DEFAULT_SERVER_CERT_DNS_NAMES + names = tuple(name.strip() for name in raw.split(",") if name.strip()) + return names or _DEFAULT_SERVER_CERT_DNS_NAMES + class OpenAIHandler(BaseHTTPRequestHandler): """Handles OpenAI-compatible API requests over HTTPS.""" @@ -221,8 +241,9 @@ def main() -> None: # Generate CA and certificates ca = trustme.CA() - # Server cert with SANs for Docker service name and localhost - server_cert = ca.issue_cert("mock-tls-inference", "localhost", "127.0.0.1") + server_dns_names = _server_cert_dns_names() + print(f" Server cert DNS names: {', '.join(server_dns_names)}") + server_cert = ca.issue_cert(*server_dns_names) # Client cert for mTLS testing (use a simple hostname without spaces) client_cert = ca.issue_cert("tls-e2e-test-client") diff --git a/tests/e2e/test_list.txt b/tests/e2e/test_list.txt index 34e1b8647..bc04afff3 100644 --- a/tests/e2e/test_list.txt +++ b/tests/e2e/test_list.txt @@ -25,5 +25,7 @@ features/mcp_servers_api.feature features/mcp_servers_api_auth.feature features/mcp_servers_api_no_config.feature features/proxy.feature -features/tls.feature -features/opentelemetry.feature +features/tls-ca.feature +features/tls-mtls.feature +features/tls-tlsv13.feature +features/opentelemetry.feature \ No newline at end of file diff --git a/tests/e2e/utils/llama_config_utils.py b/tests/e2e/utils/llama_config_utils.py index eb5f67b9d..e8fdf4832 100644 --- a/tests/e2e/utils/llama_config_utils.py +++ b/tests/e2e/utils/llama_config_utils.py @@ -3,6 +3,7 @@ import os import shutil import tempfile +from pathlib import Path from typing import Any, Optional import yaml @@ -20,6 +21,25 @@ _llama_config_backup_key: dict[str, Optional[str]] = {"value": None} +def clear_llama_config_backup() -> None: + """Drop in-memory run.yaml backup (e.g. 
at start of a tls-*.feature file).""" + _llama_config_backup_key["value"] = None + + +def reset_llama_run_config_to_pipeline_default() -> None: + """Reset llama-stack-config run.yaml to Konflux/Prow pipeline seed (run-ci.yaml).""" + if not is_prow_environment(): + return + run_ci = ( + Path(__file__).resolve().parents[1] / "configs" / "run-ci.yaml" + ) + if not run_ci.is_file(): + print(f"WARN: pipeline run.yaml seed not found at {run_ci}", flush=True) + return + print(f"Resetting llama-stack-config from {run_ci.name}...", flush=True) + update_llama_run_configmap(str(run_ci)) + + def _local_llama_config_path() -> str: """Return local run.yaml path for Docker/local e2e execution.""" return os.getenv("E2E_LLAMA_CONFIG_PATH", _DEFAULT_LOCAL_LLAMA_CONFIG_PATH) diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index ff771904b..aaaba1c7f 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -93,11 +93,18 @@ def restart_pod(container_name: str) -> None: """ if container_name in _LLAMA_RESTART_NAMES: op = "restart-llama-stack" - timeout = 420 + # Subprocess cap must exceed e2e-ops internal waits (pod + in-pod health + port-forward). + # Konflux TLS full recreate: ~6–12 min typical, 15+ min under load (user-reported 400s+). + if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1": + timeout = 1200 + elif os.environ.get("E2E_KONFLUX_E2E") == "1": + timeout = 720 + else: + timeout = 420 elif container_name in _LIGHTSPEED_RESTART_NAMES: op = "restart-lightspeed" - # Pod wait (up to ~120s) + port-forward retries + slow Konflux/Prow clusters. - timeout = 320 + # Konflux LCS: TCP readiness + Llama handshake; full recreate can exceed 10 min under load. 
+ timeout = 1200 if os.environ.get("E2E_KONFLUX_E2E") == "1" else 320 else: print( f"Warning: restart_pod({container_name!r}) unknown; " @@ -110,11 +117,17 @@ def restart_pod(container_name: str) -> None: print(result.stdout, end="") if result.returncode != 0: print(result.stderr, end="") - detail = (result.stderr or result.stdout or "").strip() + combined = f"{result.stdout or ''}\n{result.stderr or ''}".strip() + # Prefer full e2e-ops output when diagnostics were printed (TLS/Llama failures). + if "========== failure logs:" in combined: + detail = combined + else: + detail = "\n".join(combined.splitlines()[-40:]) if combined else "" + detail = detail or f"exit {result.returncode}" raise subprocess.CalledProcessError( result.returncode, op, - detail or None, + detail, ) except subprocess.TimeoutExpired as e: print(f"Failed to restart pod {container_name}: {e}") @@ -128,8 +141,11 @@ def restore_llama_stack_pod() -> None: subprocess.CalledProcessError: If oc/e2e-ops restore fails. subprocess.TimeoutExpired: If the operation times out. """ - # wait_for_pod (up to ~180s) + in-pod /v1/health polling (~105s) — allow headroom. - result = run_e2e_ops("restart-llama-stack", timeout=420) + if os.environ.get("E2E_KONFLUX_E2E") == "1": + timeout = 720 + else: + timeout = 420 + result = run_e2e_ops("restart-llama-stack", timeout=timeout) print(result.stdout, end="") if result.returncode != 0: print(result.stderr, end="")