diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go
index 0e539ab9a14..f824d7c0742 100644
--- a/e2e/scenario_test.go
+++ b/e2e/scenario_test.go
@@ -2429,6 +2429,41 @@ func Test_Ubuntu2404_SecureTLSBootstrapping_BootstrapToken_Fallback(t *testing.T
 	})
 }
 
+// Test_Ubuntu2204_SecureTLSBootstrapping_APIServerIPEnvVar validates that the
+// CSE shell code (configureAndStartSecureTLSBootstrapping in cse_config.sh)
+// resolves the API server IP at provisioning time and writes it as
+// APISERVER_IP= into /etc/default/secure-tls-bootstrap.
+//
+// Tracking: AB#38327357. The companion STLS client change in
+// Azure/aks-secure-tls-bootstrap reads this env var and dials the IP literal
+// directly so the gRPC dns:/// resolver is never consulted on retries.
+//
+// This test validates only the AgentBaker side (the env var is correctly
+// populated). The end-to-end "DNS blackhole, STLS still succeeds" test
+// requires the new STLS client binary baked into the VHD and is tracked as
+// a follow-up.
+func Test_Ubuntu2204_SecureTLSBootstrapping_APIServerIPEnvVar(t *testing.T) {
+	RunScenario(t, &Scenario{
+		Description: "validates that CSE writes APISERVER_IP into /etc/default/secure-tls-bootstrap so STLS can dial without DNS",
+		Config: Config{
+			Cluster: ClusterKubenet,
+			VHD:     config.VHDUbuntu2204Gen2Containerd,
+			BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
+				nbc.SecureTLSBootstrappingConfig = &datamodel.SecureTLSBootstrappingConfig{
+					Enabled: true,
+				}
+			},
+			Validator: func(ctx context.Context, s *Scenario) {
+				// The env-var line must be present and assign a non-empty value.
+				// The resolver block falls back through IMDS tag -> getent ahostsv4
+				// -> getent ahostsv6 and writes nothing if all sources fail, so a
+				// missing line would indicate a hard regression.
+				ValidateFileHasContent(ctx, s, "/etc/default/secure-tls-bootstrap", "APISERVER_IP=")
+			},
+		},
+	})
+}
+
 func Test_Ubuntu2404Gen2_GPUNoDriver(t *testing.T) {
 	RunScenario(t, &Scenario{
 		Description: "Tests that a GPU-enabled node using the Ubuntu 2404 VHD opting for skipping gpu driver installation can be properly bootstrapped",
diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh
index 136da897279..ad0eb5378ce 100755
--- a/parts/linux/cloud-init/artifacts/cse_config.sh
+++ b/parts/linux/cloud-init/artifacts/cse_config.sh
@@ -548,6 +548,51 @@ configureAndStartSecureTLSBootstrapping() {
         BOOTSTRAP_CLIENT_FLAGS="${BOOTSTRAP_CLIENT_FLAGS} --deadline=${SECURE_TLS_BOOTSTRAPPING_DEADLINE}"
     fi
 
+    # AB#38327357: resolve the apiserver IP locally and hand it to STLS via the
+    # APISERVER_IP env var so the client can dial the literal IP and skip the
+    # gRPC dns:/// resolver. If anything fails the var stays empty, the line
+    # is omitted, and STLS falls back to its existing FQDN dial path. Best
+    # effort only — must never fail CSE.
+    APISERVER_IP=""
+    case "${API_SERVER_NAME}" in
+        ''|*[!0-9.]*)
+            # Not a plain IPv4 literal. Try the IMDS aksAPIServerIPAddress tag
+            # (private clusters only — same source reconcile-private-hosts.sh
+            # uses for privatelink FQDNs), then DNS via getent.
+            case "${API_SERVER_NAME}" in
+                *.privatelink.*)
+                    # IMDS text format is "key:value;key:value". Strip only the
+                    # key and its first ':' so IPv6 values keep their own colons.
+                    APISERVER_IP=$(curl -sSL -m 5 -H "Metadata: true" \
+                        "http://169.254.169.254/metadata/instance/compute/tags?api-version=2019-03-11&format=text" 2>/dev/null \
+                        | tr ';' '\n' \
+                        | awk -F: 'tolower($1) == "aksapiserveripaddress" { sub(/^[^:]*:/, ""); print; exit }')
+                    # Discard IMDS values that are not plausible IP literals so
+                    # we fall through to getent instead of short-circuiting on
+                    # an invalid (or absent) tag.
+                    case "${APISERVER_IP}" in
+                        ''|*[!0-9a-fA-F:.]*) APISERVER_IP="" ;;
+                    esac
+                    ;;
+            esac
+            if [ -z "${APISERVER_IP}" ] && [ -n "${API_SERVER_NAME}" ]; then
+                APISERVER_IP=$(getent ahostsv4 "${API_SERVER_NAME}" 2>/dev/null | awk '/STREAM/ { print $1; exit }')
+            fi
+            if [ -z "${APISERVER_IP}" ] && [ -n "${API_SERVER_NAME}" ]; then
+                APISERVER_IP=$(getent ahostsv6 "${API_SERVER_NAME}" 2>/dev/null | awk '/STREAM/ { print $1; exit }')
+            fi
+            # Final sanity: discard anything that isn't a plausible IP literal.
+            # The STLS client also validates with net.ParseIP, but reject early
+            # so we don't write garbage into the EnvironmentFile.
+            case "${APISERVER_IP}" in
+                ''|*[!0-9a-fA-F:.]*) APISERVER_IP="" ;;
+            esac
+            ;;
+        *)
+            APISERVER_IP="${API_SERVER_NAME}"
+            ;;
+    esac
+
     mkdir -p "$(dirname "${SECURE_TLS_BOOTSTRAPPING_DEFAULT_FILE}")"
     touch "${SECURE_TLS_BOOTSTRAPPING_DEFAULT_FILE}"
     chmod 0600 "${SECURE_TLS_BOOTSTRAPPING_DEFAULT_FILE}"
@@ -555,6 +600,9 @@ configureAndStartSecureTLSBootstrapping() {
     if [ -n "${AZURE_ENVIRONMENT_FILEPATH}" ]; then
         echo "AZURE_ENVIRONMENT_FILEPATH=${AZURE_ENVIRONMENT_FILEPATH}" >> "${SECURE_TLS_BOOTSTRAPPING_DEFAULT_FILE}"
     fi
+    if [ -n "${APISERVER_IP}" ]; then
+        echo "APISERVER_IP=${APISERVER_IP}" >> "${SECURE_TLS_BOOTSTRAPPING_DEFAULT_FILE}"
+    fi
 
     mkdir -p "$(dirname "${SECURE_TLS_BOOTSTRAPPING_DROP_IN}")"
     touch "${SECURE_TLS_BOOTSTRAPPING_DROP_IN}"
diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
index 3b7c970bb3d..d35f9aedd5d 100755
--- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
+++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
@@ -1227,6 +1227,17 @@ SETUP_EOF
             echo "chmod $@"
         }
 
+        # AB#38327357: stub external resolvers so the default tests do not depend on
+        # CI DNS / IMDS reachability. Per-test overrides further down provide
+        # specific responses for the new IP-resolution cases.
+        curl() {
+            return 1
+        }
+
+        getent() {
+            return 2
+        }
+
         cleanup() {
             rm -rf "$SECURE_TLS_BOOTSTRAPPING_DROP_IN_DIR"
             rm -rf "$SECURE_TLS_BOOTSTRAPPING_DEFAULT_FILE_DIR"
@@ -1250,6 +1261,7 @@ SETUP_EOF
             The contents of file "secure-tls-bootstrap.service.d/10-securetlsbootstrap.conf" should include "WantedBy=kubelet.service"
             The contents of file "default/secure-tls-bootstrap" should include 'BOOTSTRAP_FLAGS=--aad-resource=6dae42f8-4368-4678-94ff-3960e28e3630 --apiserver-fqdn=fqdn --cloud-provider-config=/etc/kubernetes/azure.json'
             The contents of file "default/secure-tls-bootstrap" should not include 'AZURE_ENVIRONMENT_FILEPATH'
+            The contents of file "default/secure-tls-bootstrap" should not include 'APISERVER_IP='
             The status should be success
         End
 
@@ -1262,6 +1274,7 @@ SETUP_EOF
             The output should include "systemctlEnableAndStartNoBlock secure-tls-bootstrap 30"
             The contents of file "default/secure-tls-bootstrap" should include 'BOOTSTRAP_FLAGS=--aad-resource=6dae42f8-4368-4678-94ff-3960e28e3630 --apiserver-fqdn=fqdn --cloud-provider-config=/etc/kubernetes/azure.json'
             The contents of file "default/secure-tls-bootstrap" should include 'AZURE_ENVIRONMENT_FILEPATH=/etc/kubernetes/akscustom.json'
+            The contents of file "default/secure-tls-bootstrap" should not include 'APISERVER_IP='
             The status should be success
         End
 
@@ -1291,6 +1304,158 @@ SETUP_EOF
             The contents of file "default/secure-tls-bootstrap" should include 'BOOTSTRAP_FLAGS=--aad-resource=custom-resource --apiserver-fqdn=fqdn --cloud-provider-config=/etc/kubernetes/azure.json --user-assigned-identity-id=custom-identity-id --validate-kubeconfig-timeout=custom-validate-kubeconfig-timeout --get-access-token-timeout=custom-get-access-token-timeout --get-instance-data-timeout=custom-get-instance-data-timeout --get-nonce-timeout=custom-get-nonce-timeout --get-attested-data-timeout=custom-get-attested-data-timeout --get-credential-timeout=custom-get-credential-timeout --deadline=custom-deadline'
             The status should be success
         End
+
+        # AB#38327357: APISERVER_IP resolution coverage. The new resolver runs
+        # before the EnvironmentFile is written so STLS can dial the apiserver
+        # IP directly and bypass gRPC's dns resolver when node DNS is broken.
+        It 'should write APISERVER_IP as-is when API_SERVER_NAME is already an IPv4 literal'
+            systemctlEnableAndStartNoBlock() {
+                echo "systemctlEnableAndStartNoBlock $@"
+            }
+            API_SERVER_NAME="10.0.0.5"
+            When call configureAndStartSecureTLSBootstrapping
+            The output should include "systemctlEnableAndStartNoBlock secure-tls-bootstrap 30"
+            The contents of file "default/secure-tls-bootstrap" should include 'APISERVER_IP=10.0.0.5'
+            The status should be success
+        End
+
+        It 'should resolve APISERVER_IP via getent ahostsv4 when DNS works'
+            systemctlEnableAndStartNoBlock() {
+                echo "systemctlEnableAndStartNoBlock $@"
+            }
+            getent() {
+                # First arg is the database (ahostsv4 / ahostsv6).
+                if [ "$1" = "ahostsv4" ]; then
+                    printf '10.0.0.6 STREAM example.hcp.eastus.azmk8s.io\n10.0.0.6 DGRAM\n10.0.0.6 RAW\n'
+                    return 0
+                fi
+                return 2
+            }
+            API_SERVER_NAME="example.hcp.eastus.azmk8s.io"
+            When call configureAndStartSecureTLSBootstrapping
+            The output should include "systemctlEnableAndStartNoBlock secure-tls-bootstrap 30"
+            The contents of file "default/secure-tls-bootstrap" should include 'APISERVER_IP=10.0.0.6'
+            The status should be success
+        End
+
+        It 'should fall back to getent ahostsv6 when ahostsv4 has no answer'
+            systemctlEnableAndStartNoBlock() {
+                echo "systemctlEnableAndStartNoBlock $@"
+            }
+            getent() {
+                if [ "$1" = "ahostsv6" ]; then
+                    printf '2603:1030::1 STREAM v6only.hcp.eastus.azmk8s.io\n'
+                    return 0
+                fi
+                return 2
+            }
+            API_SERVER_NAME="v6only.hcp.eastus.azmk8s.io"
+            When call configureAndStartSecureTLSBootstrapping
+            The output should include "systemctlEnableAndStartNoBlock secure-tls-bootstrap 30"
+            The contents of file "default/secure-tls-bootstrap" should include 'APISERVER_IP=2603:1030::1'
+            The status should be success
+        End
+
+        It 'should prefer the IMDS aksAPIServerIPAddress tag for privatelink FQDNs'
+            systemctlEnableAndStartNoBlock() {
+                echo "systemctlEnableAndStartNoBlock $@"
+            }
+            curl() {
+                # IMDS returns key:value pairs separated by semicolons.
+                echo "aksAPIServerIPAddress:10.224.0.4;otherTag:someValue"
+                return 0
+            }
+            getent() {
+                # Must not be needed once IMDS hits. The caller redirects stderr to
+                # /dev/null, so this message is never seen; returning no address is
+                # what makes the APISERVER_IP assertion below fail if DNS is used.
+                echo "getent should not have been called" >&2
+                return 1
+            }
+            API_SERVER_NAME="example.privatelink.eastus.azmk8s.io"
+            When call configureAndStartSecureTLSBootstrapping
+            The output should include "systemctlEnableAndStartNoBlock secure-tls-bootstrap 30"
+            The contents of file "default/secure-tls-bootstrap" should include 'APISERVER_IP=10.224.0.4'
+            The status should be success
+        End
+
+        It 'should fall back to DNS when IMDS returns no aksAPIServerIPAddress tag'
+            systemctlEnableAndStartNoBlock() {
+                echo "systemctlEnableAndStartNoBlock $@"
+            }
+            curl() {
+                echo "otherTag:someValue;anotherTag:moreData"
+                return 0
+            }
+            getent() {
+                if [ "$1" = "ahostsv4" ]; then
+                    printf '10.224.0.5 STREAM example.privatelink.eastus.azmk8s.io\n'
+                    return 0
+                fi
+                return 2
+            }
+            API_SERVER_NAME="example.privatelink.eastus.azmk8s.io"
+            When call configureAndStartSecureTLSBootstrapping
+            The output should include "systemctlEnableAndStartNoBlock secure-tls-bootstrap 30"
+            The contents of file "default/secure-tls-bootstrap" should include 'APISERVER_IP=10.224.0.5'
+            The status should be success
+        End
+
+        It 'should omit APISERVER_IP when every resolver fails'
+            systemctlEnableAndStartNoBlock() {
+                echo "systemctlEnableAndStartNoBlock $@"
+            }
+            # curl + getent inherit the Describe-level stubs that return failure.
+            API_SERVER_NAME="unresolvable.example.com"
+            When call configureAndStartSecureTLSBootstrapping
+            The output should include "systemctlEnableAndStartNoBlock secure-tls-bootstrap 30"
+            The contents of file "default/secure-tls-bootstrap" should include 'BOOTSTRAP_FLAGS='
+            The contents of file "default/secure-tls-bootstrap" should not include 'APISERVER_IP='
+            The status should be success
+        End
+
+        It 'should reject IMDS responses that are not plausible IP literals'
+            systemctlEnableAndStartNoBlock() {
+                echo "systemctlEnableAndStartNoBlock $@"
+            }
+            curl() {
+                # Garbage / injected value masquerading as a tag value.
+                echo "aksAPIServerIPAddress:not-an-ip!@#"
+                return 0
+            }
+            getent() {
+                if [ "$1" = "ahostsv4" ]; then
+                    printf '10.224.0.9 STREAM example.privatelink.eastus.azmk8s.io\n'
+                    return 0
+                fi
+                return 2
+            }
+            API_SERVER_NAME="example.privatelink.eastus.azmk8s.io"
+            When call configureAndStartSecureTLSBootstrapping
+            The output should include "systemctlEnableAndStartNoBlock secure-tls-bootstrap 30"
+            # Garbage discarded, falls through to getent.
+            The contents of file "default/secure-tls-bootstrap" should include 'APISERVER_IP=10.224.0.9'
+            The contents of file "default/secure-tls-bootstrap" should not include 'APISERVER_IP=not-an-ip'
+            The status should be success
+        End
+
+        It 'should keep the whole IPv6 literal from the IMDS aksAPIServerIPAddress tag'
+            systemctlEnableAndStartNoBlock() {
+                echo "systemctlEnableAndStartNoBlock $@"
+            }
+            curl() {
+                # IPv6 values contain ':' themselves; only the first ':' of
+                # a tag separates the key from the value.
+                echo "otherTag:someValue;aksAPIServerIPAddress:fd00:1234::4"
+                return 0
+            }
+            # getent inherits the Describe-level stub that returns failure.
+            API_SERVER_NAME="example.privatelink.eastus.azmk8s.io"
+            When call configureAndStartSecureTLSBootstrapping
+            The output should include "systemctlEnableAndStartNoBlock secure-tls-bootstrap 30"
+            The contents of file "default/secure-tls-bootstrap" should include 'APISERVER_IP=fd00:1234::4'
+            The status should be success
+        End
     End
 
     Describe 'configureKubeletAndKubectl'