diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 40d42acb73a..41623ab19fe 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -43,11 +43,18 @@ type App struct { eventLogger *helpers.EventLogger // hotfixVersionPath overrides the default hotfix version file location for testing. + // It is also the path check-hotfix writes the resolved pointer to. hotfixVersionPath string // aptSourcesDir overrides the default APT sources directory for testing. aptSourcesDir string // nodeCustomDataPath overrides the default nodecustomdata path for testing. nodeCustomDataPath string + // nodeConfigPath overrides the default AKSNodeConfig path for testing. It is the + // source for check-hotfix's apiserver credentials and cold-start fallback pointer. + nodeConfigPath string + // checkHotfixConfigMapFetcher overrides the real apiserver ConfigMap GET for testing, + // letting unit tests inject canned ConfigMap JSON or errors without real networking. + checkHotfixConfigMapFetcher func(ctx context.Context) ([]byte, error) } // provision.json values are emitted as strings by the shell jq invocation. @@ -137,6 +144,16 @@ func (a *App) Run(ctx context.Context, args []string) int { return a.runDownloadHotfixCommand(ctx) }, }, + { + Name: "check-hotfix", + Usage: "Read the anc-hotfix-version ConfigMap and stage the hotfix pointer (fail-open)", + Action: func(ctx context.Context, cmd *cli.Command) error { + if len(cmd.Args().Slice()) > 0 { + return fmt.Errorf("unexpected check-hotfix arguments: %s", strings.Join(cmd.Args().Slice(), " ")) + } + return a.runCheckHotfixCommand(ctx) + }, + }, }, } diff --git a/aks-node-controller/checkhotfix.go b/aks-node-controller/checkhotfix.go new file mode 100644 index 00000000000..33c5b109d9a --- /dev/null +++ b/aks-node-controller/checkhotfix.go @@ -0,0 +1,407 @@ +package main + +import ( + "context" + "crypto/tls" + "crypto/x509" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + "os" + "path/filepath" + "strings" + "time" + + "github.com/Azure/agentbaker/aks-node-controller/helpers" + "github.com/Azure/agentbaker/aks-node-controller/pkg/nodeconfigutils" +) + +// check-hotfix reads the live-patching-controller's hotfix pointer from the +// kube-system/anc-hotfix-version ConfigMap and writes it to the same path +// download-hotfix already reads. download-hotfix then re-resolves the pointer +// against the node's baked ANC version and keeps its unchanged patch-only, +// strictly-higher gating. check-hotfix only fetches and stages the pointer; it +// never installs anything and never blocks provisioning (fail-open). +const ( + // hotfixConfigMapNamespace and hotfixConfigMapName identify the ConfigMap the + // live-patching-controller publishes the base->hotfix pointer map into. + hotfixConfigMapNamespace = "kube-system" + hotfixConfigMapName = "anc-hotfix-version" + + // hotfixConfigMapDataKey is the documented .data key holding the full + // {"hotfixes":{...}} JSON object. When absent we fall back to the single/only + // entry in .data (see parseConfigMapHotfixConfig). Per-base keys are not used. + hotfixConfigMapDataKey = "hotfixes.json" + + // apiServerHTTPS is the standard apiserver port used for the FQDN derived from the AKSNodeConfig. + apiServerHTTPSPort = "443" + + // configMapFetchTimeout caps the apiserver round-trip so a hung/slow apiserver + // never delays provisioning. + configMapFetchTimeout = 10 * time.Second +) + +// checkHotfixOutcome is the telemetry taxonomy emitted under TaskName "CheckHotfix". +type checkHotfixOutcome string + +const ( + // outcomeConfigMapRead: ConfigMap fetched + parsed OK and a hotfix entry matched this node's base. + outcomeConfigMapRead checkHotfixOutcome = "configMapRead" + // outcomeNoHotfixForBase: ConfigMap read OK but no entry matched this node's YYYYMM.DD base. + outcomeNoHotfixForBase checkHotfixOutcome = "noHotfixForBase" + // outcomeCustomDataFallback: ConfigMap read failed; the embedded customdata pointer was used. + outcomeCustomDataFallback checkHotfixOutcome = "customDataFallback" + // outcomeFailed: everything failed; nothing was staged. Provisioning still proceeds (exit 0). + outcomeFailed checkHotfixOutcome = "failed" +) + +// k8sConfigMap is the minimal shape of a Kubernetes ConfigMap GET response. ConfigMap +// .data values are strings, so the hotfix pointer is a JSON object encoded as a string. +type k8sConfigMap struct { + Data map[string]string `json:"data"` +} + +// runCheckHotfixCommand is the cli Action for `check-hotfix`. It ALWAYS returns nil so +// provisioning is never blocked: any error (404, 403, timeout, parse failure) is logged, +// emitted as telemetry, and swallowed. Internal helpers return errors for testability only. +func (a *App) runCheckHotfixCommand(ctx context.Context) error { + slog.Info("aks-node-controller check-hotfix started") + startTime := time.Now() + + outcome, err := a.checkHotfix(ctx) + + endTime := time.Now() + level := helpersEventLevel(outcome) + message := fmt.Sprintf("check-hotfix outcome=%s", outcome) + if err != nil { + message = fmt.Sprintf("%s error=%s", message, err.Error()) + slog.Warn("check-hotfix completed with error (fail-open)", "outcome", outcome, "error", err) + } else { + slog.Info("check-hotfix completed", "outcome", outcome) + } + if a.eventLogger != nil { + a.eventLogger.LogEvent("CheckHotfix", message, level, startTime, endTime) + } + + // Fail-open: never propagate an error so the cli exit code stays 0. + return nil +} + +// checkHotfix performs the fetch/parse/stage workflow and reports a telemetry outcome. +// It is fail-open by contract: the only caller (runCheckHotfixCommand) swallows the error. +func (a *App) checkHotfix(ctx context.Context) (checkHotfixOutcome, error) { + hotfixPath := a.hotfixVersionPath + if hotfixPath == "" { + hotfixPath = defaultHotfixVersionPath + } + + data, fetchErr := a.fetchHotfixConfigMap(ctx) + if fetchErr != nil { + // ConfigMap read failed: fall back to the pointer embedded in the node config + // (cold-start path). See coldStartHotfixConfig for the contract TODO. + slog.Warn("failed to read anc-hotfix-version ConfigMap, attempting cold-start fallback", + "error", fetchErr) + cfg, ok, coldErr := a.coldStartHotfixConfig() + if coldErr != nil { + return outcomeFailed, fmt.Errorf("configmap fetch failed (%v) and cold-start fallback failed: %w", fetchErr, coldErr) + } + if !ok { + return outcomeFailed, fmt.Errorf("configmap fetch failed and no cold-start pointer present: %w", fetchErr) + } + if err := writeHotfixConfig(hotfixPath, cfg); err != nil { + return outcomeFailed, fmt.Errorf("writing cold-start hotfix config: %w", err) + } + return outcomeCustomDataFallback, nil + } + + cfg, err := parseConfigMapHotfixConfig(data) + if err != nil { + return outcomeFailed, fmt.Errorf("parsing anc-hotfix-version ConfigMap: %w", err) + } + + if err := writeHotfixConfig(hotfixPath, cfg); err != nil { + return outcomeFailed, fmt.Errorf("writing hotfix config: %w", err) + } + + // Report whether this node's base actually has a pointer. download-hotfix still + // performs the authoritative patch-only-strictly-higher gating; this is telemetry only. + if cfg.resolveVersion(Version) == "" { + return outcomeNoHotfixForBase, nil + } + return outcomeConfigMapRead, nil +} + +// helpersEventLevel maps a check-hotfix outcome to a guest-agent event level. Only the +// terminal "failed" outcome is reported as an error; the rest are informational because +// the command is fail-open and provisioning continues regardless. +func helpersEventLevel(outcome checkHotfixOutcome) helpers.EventLevel { + if outcome == outcomeFailed { + return helpers.EventLevelError + } + return helpers.EventLevelInformational +} + +// fetchHotfixConfigMap returns the raw ConfigMap GET body. Tests inject +// checkHotfixConfigMapFetcher to supply canned bytes or errors without networking. +func (a *App) fetchHotfixConfigMap(ctx context.Context) ([]byte, error) { + if a.checkHotfixConfigMapFetcher != nil { + return a.checkHotfixConfigMapFetcher(ctx) + } + return a.fetchConfigMapFromAPIServer(ctx) +} + +// apiServerCreds holds the endpoint and credentials needed to reach the apiserver. +type apiServerCreds struct { + // server is the apiserver host[:port] without scheme. + server string + // token, when set, is sent as an Authorization: Bearer header. + token string + // caPEM is the cluster CA used to verify the apiserver certificate. + caPEM []byte +} + +// fetchConfigMapFromAPIServer performs the real network GET against the apiserver. It +// resolves credentials from the AKSNodeConfig (apiserver FQDN + TLS bootstrap token + CA), +// builds a short-timeout TLS client trusting the cluster CA, and returns the raw response +// body. Non-2xx responses are surfaced as errors so the caller fails open. +func (a *App) fetchConfigMapFromAPIServer(ctx context.Context) ([]byte, error) { + creds, err := a.resolveAPIServerCreds() + if err != nil { + return nil, fmt.Errorf("resolving apiserver credentials: %w", err) + } + + client, err := buildAPIServerHTTPClient(creds) + if err != nil { + return nil, fmt.Errorf("building apiserver http client: %w", err) + } + + url := fmt.Sprintf("https://%s/api/v1/namespaces/%s/configmaps/%s", + creds.server, hotfixConfigMapNamespace, hotfixConfigMapName) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("building request: %w", err) + } + if creds.token != "" { + req.Header.Set("Authorization", "Bearer "+creds.token) + } + req.Header.Set("Accept", "application/json") + + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("GET %s: %w", url, err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) + if err != nil { + return nil, fmt.Errorf("reading response body: %w", err) + } + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return nil, fmt.Errorf("apiserver returned status %d for %s", resp.StatusCode, url) + } + return body, nil +} + +// resolveAPIServerCreds gathers the apiserver endpoint + credentials from the AKSNodeConfig +// that ANC already parses: the apiserver FQDN, the TLS bootstrap token, and the cluster CA. +// +// check-hotfix runs before the provisioning scripts (cse_config.sh), so the on-node kubelet +// kubeconfigs (/var/lib/kubelet/*) and the decoded CA file (/etc/kubernetes/certs/ca.crt) do +// not exist yet -- they are written later during provisioning. The node config is the only +// credential source guaranteed to be present at this point, and it already carries everything +// we need (same auth pattern as the previous proxy, minus the proxy). +func (a *App) resolveAPIServerCreds() (apiServerCreds, error) { + return a.credsFromNodeConfig() +} + +// credsFromNodeConfig builds creds from the AKSNodeConfig: apiserver FQDN, TLS bootstrap +// token, and the cluster CA. The CA is carried in the node config as base64-encoded PEM +// (the same value cse_config.sh later base64-decodes into /etc/kubernetes/certs/ca.crt). +func (a *App) credsFromNodeConfig() (apiServerCreds, error) { + path := a.getNodeConfigPath() + raw, err := os.ReadFile(path) + if err != nil { + return apiServerCreds{}, fmt.Errorf("reading node config %s: %w", path, err) + } + cfg, err := nodeconfigutils.UnmarshalConfigurationV1(raw) + if err != nil { + // Forward-compatible parse: unknown fields are discarded, so a non-nil error here + // means the document was unusable. Continue to evaluate what we did parse. + slog.Info("node config parsed with errors, continuing with partial config", "error", err) + } + server := strings.TrimSpace(cfg.GetApiServerConfig().GetApiServerName()) + if server == "" { + return apiServerCreds{}, fmt.Errorf("node config has no api_server_config.api_server_name") + } + token := strings.TrimSpace(cfg.GetBootstrappingConfig().GetTlsBootstrappingToken()) + if token == "" { + return apiServerCreds{}, fmt.Errorf("node config has no bootstrapping_config.tls_bootstrapping_token") + } + caB64 := strings.TrimSpace(cfg.GetKubernetesCaCert()) + if caB64 == "" { + return apiServerCreds{}, fmt.Errorf("node config has no kubernetes_ca_cert") + } + caPEM, err := base64.StdEncoding.DecodeString(caB64) + if err != nil { + return apiServerCreds{}, fmt.Errorf("decoding node config kubernetes_ca_cert: %w", err) + } + return apiServerCreds{ + server: ensurePort(server, apiServerHTTPSPort), + token: token, + caPEM: caPEM, + }, nil +} + +// buildAPIServerHTTPClient builds an *http.Client trusting the cluster CA, with a short +// timeout so provisioning is never delayed. +func buildAPIServerHTTPClient(creds apiServerCreds) (*http.Client, error) { + tlsConfig := &tls.Config{MinVersion: tls.VersionTLS12} + if len(creds.caPEM) > 0 { + pool := x509.NewCertPool() + if !pool.AppendCertsFromPEM(creds.caPEM) { + return nil, fmt.Errorf("failed to parse cluster CA PEM") + } + tlsConfig.RootCAs = pool + } + return &http.Client{ + Timeout: configMapFetchTimeout, + Transport: &http.Transport{TLSClientConfig: tlsConfig}, + }, nil +} + +// parseConfigMapHotfixConfig extracts the hotfix pointer from a ConfigMap GET body. Q1 +// decision: .data holds the full {"hotfixes":{...}} JSON object under a SINGLE key. We +// prefer the documented key name (hotfixConfigMapDataKey); if absent we use the single/only +// entry. The value unmarshals DIRECTLY into the shared 2.1a hotfixConfig, so check-hotfix +// and download-hotfix share ONE identical parser and data contract. +func parseConfigMapHotfixConfig(data []byte) (hotfixConfig, error) { + var cm k8sConfigMap + if err := json.Unmarshal(data, &cm); err != nil { + return hotfixConfig{}, fmt.Errorf("unmarshaling ConfigMap: %w", err) + } + if len(cm.Data) == 0 { + return hotfixConfig{}, fmt.Errorf("ConfigMap has no data") + } + + value, ok := cm.Data[hotfixConfigMapDataKey] + if !ok { + if len(cm.Data) != 1 { + return hotfixConfig{}, fmt.Errorf("ConfigMap data has no %q key and %d entries (expected exactly 1)", + hotfixConfigMapDataKey, len(cm.Data)) + } + for _, v := range cm.Data { + value = v + } + } + + value = strings.TrimSpace(value) + if value == "" { + return hotfixConfig{}, fmt.Errorf("ConfigMap hotfix entry is empty") + } + + var cfg hotfixConfig + if err := json.Unmarshal([]byte(value), &cfg); err != nil { + return hotfixConfig{}, fmt.Errorf("unmarshaling hotfix pointer JSON: %w", err) + } + return cfg, nil +} + +// coldStartHotfixConfig reads a LENIENT top-level "hotfixes" object from the AKSNodeConfig +// JSON. This is the PoC cold-start fallback used only when the ConfigMap read fails. +// +// TODO(2.1b): There is no formalized AKSNodeConfig contract field for the embedded pointer +// yet - the absvc/aks-rp side that would populate a typed field is not built. Once that +// contract exists, replace this lenient top-level read with the typed field and drop the +// permissive JSON shape. Until then we read it best-effort and never fail provisioning. +func (a *App) coldStartHotfixConfig() (hotfixConfig, bool, error) { + path := a.getNodeConfigPath() + raw, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return hotfixConfig{}, false, nil + } + return hotfixConfig{}, false, fmt.Errorf("reading node config %s: %w", path, err) + } + + // Lenient parse: the AKSNodeConfig is protojson, but the cold-start pointer is an + // out-of-contract top-level object, so parse it permissively with encoding/json. + var lenient struct { + Hotfixes map[string]string `json:"hotfixes"` + } + if err := json.Unmarshal(raw, &lenient); err != nil { + return hotfixConfig{}, false, fmt.Errorf("parsing cold-start hotfixes from node config: %w", err) + } + if len(lenient.Hotfixes) == 0 { + return hotfixConfig{}, false, nil + } + return hotfixConfig{Hotfixes: lenient.Hotfixes}, true, nil +} + +// writeHotfixConfig writes the resolved config to the path download-hotfix reads, in the +// exact {"hotfixes":{...}} shape so download-hotfix re-resolves and applies its unchanged +// gating. The write is atomic (temp file + rename) so a concurrent reader never sees a +// partial file. +func writeHotfixConfig(path string, cfg hotfixConfig) error { + // Only persist the map shape; the legacy Version field is intentionally omitted so the + // on-disk contract matches what the live-patching-controller ConfigMap publishes. + out := hotfixConfig{Hotfixes: cfg.Hotfixes} + data, err := json.Marshal(out) + if err != nil { + return fmt.Errorf("marshaling hotfix config: %w", err) + } + + dir := filepath.Dir(path) + tmp, err := os.CreateTemp(dir, ".aks-node-controller-hotfix-*") + if err != nil { + return fmt.Errorf("creating temp file in %s: %w", dir, err) + } + tmpPath := tmp.Name() + if _, err := tmp.Write(data); err != nil { + tmp.Close() + os.Remove(tmpPath) + return fmt.Errorf("writing temp file %s: %w", tmpPath, err) + } + if err := tmp.Close(); err != nil { + os.Remove(tmpPath) + return fmt.Errorf("closing temp file %s: %w", tmpPath, err) + } + if err := os.Rename(tmpPath, path); err != nil { + os.Remove(tmpPath) + return fmt.Errorf("renaming %s to %s: %w", tmpPath, path, err) + } + slog.Info("staged hotfix pointer for download-hotfix", "path", path) + return nil +} + +// getNodeConfigPath returns the injectable node-config path, defaulting to the standard +// AKSNodeConfig location that ANC already reads. +func (a *App) getNodeConfigPath() string { + if a.nodeConfigPath != "" { + return a.nodeConfigPath + } + return nodeconfigutils.AKSNodeConfigFilePath +} + +// ensurePort appends ":" to host when it has no port. IPv6 literals already in +// bracketed host:port form ("[::1]:443") are left unchanged. +func ensurePort(host, port string) string { + host = stripScheme(strings.TrimSpace(host)) + if host == "" { + return host + } + // Already has a port (account for IPv6 "[..]:p"). + if i := strings.LastIndex(host, ":"); i != -1 && !strings.Contains(host[i+1:], "]") { + return host + } + return host + ":" + port +} + +// stripScheme removes a leading https:// or http:// scheme from a server URL. +func stripScheme(server string) string { + server = strings.TrimPrefix(server, "https://") + server = strings.TrimPrefix(server, "http://") + return strings.TrimRight(server, "/") +} diff --git a/aks-node-controller/checkhotfix_test.go b/aks-node-controller/checkhotfix_test.go new file mode 100644 index 00000000000..4a05494fbc6 --- /dev/null +++ b/aks-node-controller/checkhotfix_test.go @@ -0,0 +1,388 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// makeConfigMapJSON renders a Kubernetes ConfigMap GET body whose .data has the given keys. +func makeConfigMapJSON(t *testing.T, data map[string]string) []byte { + t.Helper() + cm := map[string]any{ + "kind": "ConfigMap", + "apiVersion": "v1", + "metadata": map[string]any{"name": hotfixConfigMapName, "namespace": hotfixConfigMapNamespace}, + "data": data, + } + b, err := json.Marshal(cm) + require.NoError(t, err) + return b +} + +// readStagedConfig reads back the hotfix config check-hotfix wrote. +func readStagedConfig(t *testing.T, path string) hotfixConfig { + t.Helper() + data, err := os.ReadFile(path) + require.NoError(t, err) + var cfg hotfixConfig + require.NoError(t, json.Unmarshal(data, &cfg)) + return cfg +} + +func TestParseConfigMapHotfixConfig(t *testing.T) { + t.Run("documented hotfixes.json key", func(t *testing.T) { + cm := makeConfigMapJSON(t, map[string]string{ + hotfixConfigMapDataKey: `{"hotfixes":{"202604.01":"202604.01.1","202605.01":"202605.01.2"}}`, + }) + cfg, err := parseConfigMapHotfixConfig(cm) + require.NoError(t, err) + assert.Equal(t, map[string]string{"202604.01": "202604.01.1", "202605.01": "202605.01.2"}, cfg.Hotfixes) + }) + + t.Run("single fallback key when documented key absent", func(t *testing.T) { + cm := makeConfigMapJSON(t, map[string]string{ + "some-other-key": `{"hotfixes":{"202604.01":"202604.01.1"}}`, + }) + cfg, err := parseConfigMapHotfixConfig(cm) + require.NoError(t, err) + assert.Equal(t, map[string]string{"202604.01": "202604.01.1"}, cfg.Hotfixes) + }) + + t.Run("multiple keys without documented key is an error", func(t *testing.T) { + cm := makeConfigMapJSON(t, map[string]string{ + "a": `{"hotfixes":{}}`, + "b": `{"hotfixes":{}}`, + }) + _, err := parseConfigMapHotfixConfig(cm) + require.Error(t, err) + assert.Contains(t, err.Error(), "expected exactly 1") + }) + + t.Run("empty data is an error", func(t *testing.T) { + cm := makeConfigMapJSON(t, map[string]string{}) + _, err := parseConfigMapHotfixConfig(cm) + require.Error(t, err) + assert.Contains(t, err.Error(), "no data") + }) + + t.Run("empty value is an error", func(t *testing.T) { + cm := makeConfigMapJSON(t, map[string]string{hotfixConfigMapDataKey: " "}) + _, err := parseConfigMapHotfixConfig(cm) + require.Error(t, err) + assert.Contains(t, err.Error(), "empty") + }) + + t.Run("invalid inner JSON is an error", func(t *testing.T) { + cm := makeConfigMapJSON(t, map[string]string{hotfixConfigMapDataKey: "not json"}) + _, err := parseConfigMapHotfixConfig(cm) + require.Error(t, err) + assert.Contains(t, err.Error(), "unmarshaling hotfix pointer JSON") + }) + + t.Run("invalid configmap JSON is an error", func(t *testing.T) { + _, err := parseConfigMapHotfixConfig([]byte("not a configmap")) + require.Error(t, err) + assert.Contains(t, err.Error(), "unmarshaling ConfigMap") + }) + + t.Run("shares parser with download-hotfix readHotfixConfig", func(t *testing.T) { + // The inner value written by the live-patching-controller must round-trip through + // the SAME shape that download-hotfix's readHotfixConfig consumes. + inner := `{"hotfixes":{"202604.01":"202604.01.3"}}` + cm := makeConfigMapJSON(t, map[string]string{hotfixConfigMapDataKey: inner}) + fromCM, err := parseConfigMapHotfixConfig(cm) + require.NoError(t, err) + + path := filepath.Join(t.TempDir(), "h.json") + require.NoError(t, os.WriteFile(path, []byte(inner), 0644)) + fromFile, err := readHotfixConfig(path) + require.NoError(t, err) + assert.Equal(t, fromFile, fromCM) + }) +} + +func TestCheckHotfix_SuccessReadAndWrite(t *testing.T) { + origVersion := Version + Version = "202604.01.0" + defer func() { Version = origVersion }() + + tt := NewTestApp(t, TestAppConfig{}) + path := filepath.Join(t.TempDir(), "hotfix.json") + tt.App.hotfixVersionPath = path + tt.App.checkHotfixConfigMapFetcher = func(context.Context) ([]byte, error) { + return makeConfigMapJSON(t, map[string]string{ + hotfixConfigMapDataKey: `{"hotfixes":{"202604.01":"202604.01.1"}}`, + }), nil + } + + outcome, err := tt.App.checkHotfix(context.Background()) + require.NoError(t, err) + assert.Equal(t, outcomeConfigMapRead, outcome) + + cfg := readStagedConfig(t, path) + assert.Equal(t, map[string]string{"202604.01": "202604.01.1"}, cfg.Hotfixes) +} + +func TestCheckHotfix_NoHotfixForBase(t *testing.T) { + origVersion := Version + Version = "202607.15.0" // base not present in the ConfigMap + defer func() { Version = origVersion }() + + tt := NewTestApp(t, TestAppConfig{}) + path := filepath.Join(t.TempDir(), "hotfix.json") + tt.App.hotfixVersionPath = path + tt.App.checkHotfixConfigMapFetcher = func(context.Context) ([]byte, error) { + return makeConfigMapJSON(t, map[string]string{ + hotfixConfigMapDataKey: `{"hotfixes":{"202604.01":"202604.01.1"}}`, + }), nil + } + + outcome, err := tt.App.checkHotfix(context.Background()) + require.NoError(t, err) + assert.Equal(t, outcomeNoHotfixForBase, outcome) + + // The full pointer is still staged so download-hotfix re-resolves authoritatively. + cfg := readStagedConfig(t, path) + assert.Equal(t, map[string]string{"202604.01": "202604.01.1"}, cfg.Hotfixes) +} + +func TestCheckHotfix_FetchErrorFailsOpenWithoutFallback(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + path := filepath.Join(t.TempDir(), "hotfix.json") + tt.App.hotfixVersionPath = path + // No node config -> no cold-start fallback available. + tt.App.nodeConfigPath = filepath.Join(t.TempDir(), "nonexistent-config.json") + + cases := map[string]error{ + "404 not found": errors.New("apiserver returned status 404"), + "403 forbidden": errors.New("apiserver returned status 403"), + "timeout": context.DeadlineExceeded, + "connection err": errors.New("dial tcp: connection refused"), + } + for name, fetchErr := range cases { + t.Run(name, func(t *testing.T) { + tt.App.checkHotfixConfigMapFetcher = func(context.Context) ([]byte, error) { + return nil, fetchErr + } + outcome, err := tt.App.checkHotfix(context.Background()) + assert.Equal(t, outcomeFailed, outcome) + assert.Error(t, err) + // Nothing should be staged. + _, statErr := os.Stat(path) + assert.True(t, os.IsNotExist(statErr)) + }) + } +} + +func TestCheckHotfix_InvalidConfigMapFailsOpen(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + path := filepath.Join(t.TempDir(), "hotfix.json") + tt.App.hotfixVersionPath = path + tt.App.checkHotfixConfigMapFetcher = func(context.Context) ([]byte, error) { + return []byte(`{"data":{"hotfixes.json":"not valid json"}}`), nil + } + + outcome, err := tt.App.checkHotfix(context.Background()) + assert.Equal(t, outcomeFailed, outcome) + assert.Error(t, err) + _, statErr := os.Stat(path) + assert.True(t, os.IsNotExist(statErr)) +} + +func TestCheckHotfix_ColdStartFallback(t *testing.T) { + origVersion := Version + Version = "202604.01.0" + defer func() { Version = origVersion }() + + tt := NewTestApp(t, TestAppConfig{}) + path := filepath.Join(t.TempDir(), "hotfix.json") + tt.App.hotfixVersionPath = path + + // Node config carries a lenient top-level hotfixes pointer (PoC cold-start contract). + nodeConfig := filepath.Join(t.TempDir(), "aks-node-controller-config.json") + require.NoError(t, os.WriteFile(nodeConfig, []byte( + `{"version":"v1","hotfixes":{"202604.01":"202604.01.2"}}`), 0644)) + tt.App.nodeConfigPath = nodeConfig + + tt.App.checkHotfixConfigMapFetcher = func(context.Context) ([]byte, error) { + return nil, errors.New("apiserver returned status 404") + } + + outcome, err := tt.App.checkHotfix(context.Background()) + require.NoError(t, err) + assert.Equal(t, outcomeCustomDataFallback, outcome) + + cfg := readStagedConfig(t, path) + assert.Equal(t, map[string]string{"202604.01": "202604.01.2"}, cfg.Hotfixes) +} + +func TestCheckHotfix_ColdStartNoPointerFails(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + path := filepath.Join(t.TempDir(), "hotfix.json") + tt.App.hotfixVersionPath = path + + nodeConfig := filepath.Join(t.TempDir(), "aks-node-controller-config.json") + require.NoError(t, os.WriteFile(nodeConfig, []byte(`{"version":"v1"}`), 0644)) + tt.App.nodeConfigPath = nodeConfig + tt.App.checkHotfixConfigMapFetcher = func(context.Context) ([]byte, error) { + return nil, errors.New("apiserver returned status 403") + } + + outcome, err := tt.App.checkHotfix(context.Background()) + assert.Equal(t, outcomeFailed, outcome) + assert.Error(t, err) + _, statErr := os.Stat(path) + assert.True(t, os.IsNotExist(statErr)) +} + +// TestRunCheckHotfixCommand_AlwaysFailOpen verifies the cli Action always returns nil +// (exit 0) and emits telemetry, regardless of the underlying outcome. +func TestRunCheckHotfixCommand_AlwaysFailOpen(t *testing.T) { + t.Run("success path emits informational event and exits 0", func(t *testing.T) { + origVersion := Version + Version = "202604.01.0" + defer func() { Version = origVersion }() + + tt := NewTestApp(t, TestAppConfig{}) + tt.App.hotfixVersionPath = filepath.Join(t.TempDir(), "hotfix.json") + tt.App.checkHotfixConfigMapFetcher = func(context.Context) ([]byte, error) { + return makeConfigMapJSON(t, map[string]string{ + hotfixConfigMapDataKey: `{"hotfixes":{"202604.01":"202604.01.1"}}`, + }), nil + } + + err := tt.App.runCheckHotfixCommand(context.Background()) + require.NoError(t, err) + + events := tt.eventLogger.Events() + require.Len(t, events, 1) + assert.Equal(t, "AKS.AKSNodeController.CheckHotfix", events[0].TaskName) + assert.Equal(t, "Informational", events[0].EventLevel) + assert.Contains(t, events[0].Message, string(outcomeConfigMapRead)) + }) + + t.Run("failure path emits error event but still exits 0", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + tt.App.hotfixVersionPath = filepath.Join(t.TempDir(), "hotfix.json") + tt.App.nodeConfigPath = filepath.Join(t.TempDir(), "nonexistent.json") + tt.App.checkHotfixConfigMapFetcher = func(context.Context) ([]byte, error) { + return nil, errors.New("apiserver returned status 500") + } + + err := tt.App.runCheckHotfixCommand(context.Background()) + require.NoError(t, err) + + events := tt.eventLogger.Events() + require.Len(t, events, 1) + assert.Equal(t, "AKS.AKSNodeController.CheckHotfix", events[0].TaskName) + assert.Equal(t, "Error", events[0].EventLevel) + assert.Contains(t, events[0].Message, string(outcomeFailed)) + }) + + t.Run("cli wiring returns exit code 0 even on fetch failure", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + tt.App.hotfixVersionPath = filepath.Join(t.TempDir(), "hotfix.json") + tt.App.nodeConfigPath = filepath.Join(t.TempDir(), "nonexistent.json") + tt.App.checkHotfixConfigMapFetcher = func(context.Context) ([]byte, error) { + return nil, errors.New("boom") + } + exitCode := tt.App.Run(context.Background(), []string{"aks-node-controller", "check-hotfix"}) + assert.Equal(t, 0, exitCode) + }) +} + +func TestCheckHotfix_DefaultsToConfigMapFetcherWhenNoInjection(t *testing.T) { + // With no injected fetcher and no reachable apiserver, the real path is exercised: it + // must fail-open. Point the node-config source at a nonexistent path so credential + // resolution fails deterministically and the network is never actually dialed. + tt := NewTestApp(t, TestAppConfig{}) + tt.App.hotfixVersionPath = filepath.Join(t.TempDir(), "hotfix.json") + tt.App.nodeConfigPath = filepath.Join(t.TempDir(), "nonexistent.json") + // checkHotfixConfigMapFetcher intentionally nil. + + err := tt.App.runCheckHotfixCommand(context.Background()) + require.NoError(t, err) +} + +func TestEnsurePort(t *testing.T) { + assert.Equal(t, "host.example.com:443", ensurePort("host.example.com", "443")) + assert.Equal(t, "host.example.com:443", ensurePort("https://host.example.com", "443")) + assert.Equal(t, "host.example.com:6443", ensurePort("host.example.com:6443", "443")) + assert.Equal(t, "host.example.com:443", ensurePort("https://host.example.com/", "443")) + assert.Equal(t, "", ensurePort("", "443")) +} + +func TestStripScheme(t *testing.T) { + assert.Equal(t, "host:443", stripScheme("https://host:443")) + assert.Equal(t, "host:443", stripScheme("http://host:443")) + assert.Equal(t, "host:443", stripScheme("host:443")) + assert.Equal(t, "host", stripScheme("https://host/")) +} + +func TestBuildAPIServerHTTPClient(t *testing.T) { + t.Run("invalid CA PEM is an error", func(t *testing.T) { + _, err := buildAPIServerHTTPClient(apiServerCreds{caPEM: []byte("not a pem")}) + require.Error(t, err) + assert.Contains(t, err.Error(), "cluster CA PEM") + }) + + t.Run("no CA builds a client with default timeout", func(t *testing.T) { + client, err := buildAPIServerHTTPClient(apiServerCreds{}) + require.NoError(t, err) + assert.Equal(t, configMapFetchTimeout, client.Timeout) + }) +} + +func TestColdStartHotfixConfig(t *testing.T) { + t.Run("missing file returns not-present without error", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + tt.App.nodeConfigPath = filepath.Join(t.TempDir(), "nope.json") + cfg, ok, err := tt.App.coldStartHotfixConfig() + require.NoError(t, err) + assert.False(t, ok) + assert.Nil(t, cfg.Hotfixes) + }) + + t.Run("present pointer is parsed", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + p := filepath.Join(t.TempDir(), "config.json") + require.NoError(t, os.WriteFile(p, []byte(`{"version":"v1","hotfixes":{"202604.01":"202604.01.5"}}`), 0644)) + tt.App.nodeConfigPath = p + cfg, ok, err := tt.App.coldStartHotfixConfig() + require.NoError(t, err) + assert.True(t, ok) + assert.Equal(t, map[string]string{"202604.01": "202604.01.5"}, cfg.Hotfixes) + }) + + t.Run("no hotfixes key returns not-present", func(t *testing.T) { + tt := NewTestApp(t, TestAppConfig{}) + p := filepath.Join(t.TempDir(), "config.json") + require.NoError(t, os.WriteFile(p, []byte(`{"version":"v1"}`), 0644)) + tt.App.nodeConfigPath = p + _, ok, err := tt.App.coldStartHotfixConfig() + require.NoError(t, err) + assert.False(t, ok) + }) +} + +func TestWriteHotfixConfig_ShapeAndAtomicity(t *testing.T) { + path := filepath.Join(t.TempDir(), "hotfix.json") + require.NoError(t, writeHotfixConfig(path, hotfixConfig{Hotfixes: map[string]string{"202604.01": "202604.01.1"}})) + + raw, err := os.ReadFile(path) + require.NoError(t, err) + // Must serialize in the {"hotfixes":{...}} shape with no legacy version field. + assert.JSONEq(t, `{"hotfixes":{"202604.01":"202604.01.1"}}`, string(raw)) + + // Round-trips through download-hotfix's reader. + cfg, err := readHotfixConfig(path) + require.NoError(t, err) + assert.Equal(t, "202604.01.1", cfg.resolveVersion("202604.01.0")) +}