tag1consulting · jeremyandrews · Jun 14, 2026 · Jun 14, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,9 @@ All notable changes to scolta-python are documented here.
 
 ## [Unreleased]
 
+### Fixed
+- **Auto-provisioned Amazee credentials stored without resolved model names no longer leave AI permanently broken (`src/scolta/ai/amazee/auto_provisioner.py`).** Provisioning persists credentials and resolves model names as two non-atomic steps (`AmazeeTrialProvisioner.provision()` stores the token+url, then calls `/model/info`). When the model-info call fails, `get_available_models()` swallows the error and returns `[]`, so the `on_models_resolved` gate never fires and no model name is persisted — but `ConfigStorage.load()` requires only token+url, so it reports the half-provisioned credentials as valid. `ensure_ai_available()` then short-circuited on stored credentials on every later request and never re-resolved, so the caller fell back to the dated config default (`claude-sonnet-4-5-20250929`), which the Amazee LiteLLM gateway rejects with HTTP 400 "Invalid model name" — failing AI silently with no self-recovery (outside `KeyExpiryRecovery`'s auth-only remit). `ensure_ai_available()` now accepts an optional `has_resolved_models` predicate: when stored credentials exist but the caller reports models are still unresolved, model resolution is re-attempted against the **already-stored key** (never a fresh trial, which would waste a server-limited allocation) and `on_models_resolved` fires as soon as at least one model name resolves, so the incomplete-provision state self-heals on the next lazy-init pass. Without the predicate the historical no-op is unchanged. A regression test drives the full provision → store → empty model resolution → re-resolve sequence. (The dated-default fallback itself lives in the consuming adapter/demo client construction, which adopts the predicate when it re-vendors.)
+
 ### Added
 - **CI now builds and validates the PyPI artifacts (`dist` job in
   `ci.yml`).** Publishing is manual and nothing in CI built the sdist/wheel, so

diff --git a/src/scolta/ai/amazee/auto_provisioner.py b/src/scolta/ai/amazee/auto_provisioner.py
@@ -19,21 +19,50 @@ def ensure_ai_available(
         has_explicit_api_key: bool = False,
         on_models_resolved: Callable[[str, str], None] | None = None,
         client: AmazeeClient | None = None,
+        has_resolved_models: Callable[[], bool] | None = None,
     ) -> bool:
         """Provision a free trial unless AI is already configured. Idempotent;
         no-op when an explicit key exists or credentials are already stored.
-        Returns True only on a successful first provisioning.
+        Returns True only when a fresh trial was provisioned.
 
         The stored-credentials no-op deliberately does NOT validate that the
         stored key still works — trial keys are revoked server-side when the
         trial ends, and that expiry is not announced at provisioning time, so a
         cheap install-hook/lazy-init guard cannot know. Call-time auth failures
         are the reliable signal: :class:`KeyExpiryRecovery` detects them and
         recovers through :meth:`reprovision`, which bypasses this no-op.
+
+        Stored credentials are treated as a *complete* provision only once their
+        model names are resolved. A provision whose ``/model/info`` call failed
+        stores the token+url with no models, leaving the caller to fall back to
+        the dated config default — which the Amazee gateway rejects with HTTP
+        400, breaking AI permanently because this guard kept no-opping on the
+        half-provisioned credentials. When the caller can confirm models are
+        still unresolved (via ``has_resolved_models``), model resolution is
+        re-attempted against the ALREADY-STORED key — never a fresh trial, which
+        would waste a server-side-limited allocation — so the incomplete-provision
+        state self-heals. Without that callback the historical no-op stands: the
+        caller cannot tell us, and we must not re-resolve blindly every request.
         """
         if has_explicit_api_key:
             return False
-        if storage.load() is not None:
+
+        credentials = storage.load()
+        if credentials is not None:
+            # Already provisioned. Self-heal only an incomplete provision — one
+            # whose model resolution failed, leaving credentials with no models
+            # — and only when the caller can confirm that state. Re-resolve
+            # against the stored key (not a new trial) and persist the result.
+            if has_resolved_models is None or has_resolved_models():
+                return False
+
+            models = AmazeeModelResolver(client or AmazeeClient()).resolve(
+                credentials["litellm_api_url"], credentials["litellm_token"]
+            )
+            if on_models_resolved is not None and (
+                models["ai_model"] is not None or models["ai_expansion_model"] is not None
+            ):
+                on_models_resolved(models["ai_model"] or "", models["ai_expansion_model"] or "")
             return False
 
         amazee_client = client or AmazeeClient()

diff --git a/tests/ai/amazee/test_amazee.py b/tests/ai/amazee/test_amazee.py
@@ -385,6 +385,105 @@ def test_reprovision_returns_false_on_api_error():
     assert storage.load() is None
 
 
+# -- auto provisioner: self-heal of an incomplete provision -------------------
+# A provision whose /model/info call failed stores token+url with no resolved
+# models. ensure_ai_available() used to no-op on those forever, so the caller
+# fell back to the dated config default the Amazee gateway rejects with HTTP 400
+# and AI broke permanently. Re-resolving against the STORED key (never a fresh
+# trial) heals it.
+
+
+def test_auto_provisioner_self_heals_half_provisioned_state():
+    # Exercise the real bug sequence end to end: a provision whose /model/info
+    # returns no models, then a later pass that re-resolves once models are
+    # reachable.
+    state = {"model_info_empty": True, "trial_calls": 0}
+
+    def handler(request):
+        if request.url.path == "/auth/generate-trial-access":
+            state["trial_calls"] += 1
+            return httpx.Response(
+                200,
+                json={"litellm_token": "tok", "litellm_api_url": "https://llm.x", "region": "us"},
+            )
+        if request.url.path == "/model/info":
+            data = (
+                []
+                if state["model_info_empty"]
+                else [{"model_name": "claude-sonnet-4-6"}, {"model_name": "claude-haiku-4-5"}]
+            )
+            return httpx.Response(200, json={"data": data})
+        return httpx.Response(404)
+
+    client = AmazeeClient(http_client=httpx.Client(transport=httpx.MockTransport(handler)))
+    storage = MemoryStorage()
+    resolved = []
+
+    # Pass 1: trial provisioning succeeds; /model/info returns no models.
+    provisioned = AutoProvisioner.ensure_ai_available(
+        storage,
+        on_models_resolved=lambda m, e: resolved.append((m, e)),
+        client=client,
+        has_resolved_models=lambda: False,
+    )
+    assert provisioned is True  # a fresh trial WAS provisioned
+    assert storage.load()["litellm_token"] == "tok"
+    assert resolved == []  # but models stayed unresolved — the gap
+
+    # Pass 2: credentials present, models still unresolved → self-heal by
+    # re-resolving against the stored key. No second trial is provisioned.
+    state["model_info_empty"] = False
+    healed = AutoProvisioner.ensure_ai_available(
+        storage,
+        on_models_resolved=lambda m, e: resolved.append((m, e)),
+        client=client,
+        has_resolved_models=lambda: False,
+    )
+    assert healed is False  # a model-only heal, not a new provision
+    assert resolved == [("claude-sonnet-4-6", "claude-haiku-4-5")]
+    assert state["trial_calls"] == 1  # never burned a second trial
+    # The resolved model is a real undated alias, never the dated default the
+    # gateway rejects.
+    assert resolved[0][0] != "claude-sonnet-4-5-20250929"
+
+
+def test_auto_provisioner_does_not_reresolve_when_models_resolved():
+    # Fully provisioned: the predicate reports models present, so /model/info is
+    # never queried (re-resolving every request is wasteful).
+    def handler(request):
+        raise AssertionError(f"no HTTP call expected, got {request.url.path}")
+
+    client = AmazeeClient(http_client=httpx.Client(transport=httpx.MockTransport(handler)))
+    storage = MemoryStorage()
+    storage.store("tok", "https://llm.x", "us")
+    called = []
+    result = AutoProvisioner.ensure_ai_available(
+        storage,
+        on_models_resolved=lambda m, e: called.append((m, e)),
+        client=client,
+        has_resolved_models=lambda: True,
+    )
+    assert result is False
+    assert called == []
+
+
+def test_auto_provisioner_stored_creds_without_predicate_stay_noop():
+    # Back-compat: a caller that does not pass has_resolved_models keeps the
+    # historical "stored credentials are complete" no-op (no HTTP call).
+    def handler(request):
+        raise AssertionError(f"no HTTP call expected, got {request.url.path}")
+
+    client = AmazeeClient(http_client=httpx.Client(transport=httpx.MockTransport(handler)))
+    storage = MemoryStorage()
+    storage.store("tok", "https://llm.x", "us")
+    called = []
+    result = AutoProvisioner.ensure_ai_available(
+        storage, on_models_resolved=lambda m, e: called.append((m, e)), client=client
+    )
+    assert result is False
+    assert called == []
+
+
 # -- budget decorator ---------------------------------------------------------