fix(compression): notify users when configured aux model fails even if main-model fallback recovers (#16775)

A misconfigured auxiliary.compression.model is a user-fixable problem that silent recovery would hide. The previous retry-on-main logic transparently swallowed aux-model failures whenever the fallback succeeded, leaving the user's broken config in place and racking up future failures.

Track the aux-model failure on the compressor alongside the existing fallback-placeholder fields:

- _last_aux_model_failure_model: str | None
- _last_aux_model_failure_error: str | None

Both are set at the moment the aux model errors (captured before summary_model is cleared for retry), regardless of whether the retry succeeds. Both are cleared at compress() start and in on_session_reset() so a clean run doesn't leak stale warnings.

Surface the failure in three places:

- gateway hygiene auto-compress: ℹ note to the platform adapter (thread_id preserved)
- gateway /compress command: ℹ line appended to the reply
- CLI via _emit_warning: deduped on (model, error) so repeat compactions don't spam

This notice is distinct from the existing ⚠️ dropped-turns warning — different severity, different emoji, explicit 'context is intact' reassurance.
2026-04-27 20:08:23 -07:00
parent c3e3a9c184
commit 6ea5699e3f
6 changed files with 367 additions and 1 deletions
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -340,6 +340,8 @@ class ContextCompressor(ContextEngine):
        self._last_summary_error = None
        self._last_summary_dropped_count = 0
        self._last_summary_fallback_used = False
+        self._last_aux_model_failure_error = None
+        self._last_aux_model_failure_model = None
        self._last_compression_savings_pct = 100.0
        self._ineffective_compression_count = 0

@@ -448,6 +450,12 @@ class ContextCompressor(ContextEngine):
        # (gateway hygiene, /compress) can surface a visible warning.
        self._last_summary_dropped_count: int = 0
        self._last_summary_fallback_used: bool = False
+        # When a user-configured summary model fails and we recover by
+        # retrying on the main model, record the failure so gateway /
+        # CLI callers can still warn the user even though compression
+        # succeeded.  Silent recovery would hide the broken config.
+        self._last_aux_model_failure_error: Optional[str] = None
+        self._last_aux_model_failure_model: Optional[str] = None

    def update_from_response(self, usage: Dict[str, Any]):
        """Update tracked token usage from API response."""
@@ -907,6 +915,14 @@ The user has requested that this compaction PRIORITISE preserving all informatio
                    "Falling back to main model '%s' for compression.",
                    self.summary_model, e, self.model,
                )
+                # Record the aux-model failure so callers can warn the user
+                # even if the retry-on-main succeeds — a misconfigured aux
+                # model is something the user needs to fix.
+                _err_text = str(e).strip() or e.__class__.__name__
+                if len(_err_text) > 220:
+                    _err_text = _err_text[:217].rstrip() + "..."
+                self._last_aux_model_failure_error = _err_text
+                self._last_aux_model_failure_model = self.summary_model
                self.summary_model = ""  # empty = use main model
                self._summary_failure_cooldown_until = 0.0  # no cooldown
                return self._generate_summary(turns_to_summarize, focus_topic=focus_topic)  # retry immediately
@@ -931,6 +947,14 @@ The user has requested that this compaction PRIORITISE preserving all informatio
                    "Retrying on main model '%s' before giving up.",
                    self.summary_model, e, self.model,
                )
+                # Record the aux-model failure (see 404 branch above) — user
+                # should know their configured model is broken even if main
+                # recovers the call.
+                _err_text = str(e).strip() or e.__class__.__name__
+                if len(_err_text) > 220:
+                    _err_text = _err_text[:217].rstrip() + "..."
+                self._last_aux_model_failure_error = _err_text
+                self._last_aux_model_failure_model = self.summary_model
                self.summary_model = ""  # empty = use main model
                self._summary_failure_cooldown_until = 0.0
                return self._generate_summary(turns_to_summarize, focus_topic=focus_topic)
@@ -1232,6 +1256,8 @@ The user has requested that this compaction PRIORITISE preserving all informatio
        self._last_summary_dropped_count = 0
        self._last_summary_fallback_used = False
        self._last_summary_error = None
+        self._last_aux_model_failure_error = None
+        self._last_aux_model_failure_model = None
        n_messages = len(messages)
        # Only need head + 3 tail messages minimum (token budget decides the real tail size)
        _min_for_compress = self.protect_first_n + 3 + 1
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -4828,6 +4828,30 @@ class GatewayRunner:
                                                "Failed to deliver compression-failure warning to user: %s",
                                                _werr,
                                            )
+                                    # Separately: if the user's CONFIGURED aux
+                                    # model failed and we recovered by falling
+                                    # back to the main model, tell them — a
+                                    # misconfigured auxiliary.compression.model
+                                    # is something only they can fix, and
+                                    # silent recovery would hide it.
+                                    elif _comp is not None and getattr(_comp, "_last_aux_model_failure_model", None):
+                                        _aux_model = getattr(_comp, "_last_aux_model_failure_model", "")
+                                        _aux_err = getattr(_comp, "_last_aux_model_failure_error", None) or "unknown error"
+                                        _aux_msg = (
+                                            f"ℹ️ Configured compression model `{_aux_model}` "
+                                            f"failed ({_aux_err}). Recovered using your main "
+                                            "model — context is intact — but you may want to "
+                                            "check `auxiliary.compression.model` in config.yaml."
+                                        )
+                                        try:
+                                            _adapter = self.adapters.get(source.platform)
+                                            if _adapter and source.chat_id:
+                                                await _adapter.send(source.chat_id, _aux_msg, metadata=_hyg_meta)
+                                        except Exception as _werr:
+                                            logger.warning(
+                                                "Failed to deliver aux-model-fallback notice to user: %s",
+                                                _werr,
+                                            )
                                finally:
                                    self._cleanup_agent_resources(_hyg_agent)

@@ -7377,6 +7401,11 @@ class GatewayRunner:
                _summary_failed = bool(getattr(compressor, "_last_summary_fallback_used", False))
                _dropped_count = int(getattr(compressor, "_last_summary_dropped_count", 0) or 0)
                _summary_err = getattr(compressor, "_last_summary_error", None)
+                # Separately: did the user's CONFIGURED aux model fail
+                # and we recovered via main?  Surface that as an info
+                # note so they can fix their config.
+                _aux_fail_model = getattr(compressor, "_last_aux_model_failure_model", None)
+                _aux_fail_err = getattr(compressor, "_last_aux_model_failure_error", None)
            finally:
                self._cleanup_agent_resources(tmp_agent)
            lines = [f"🗜️ {summary['headline']}"]
@@ -7392,6 +7421,13 @@ class GatewayRunner:
                    "with a placeholder; earlier context is no longer recoverable. "
                    "Consider checking your auxiliary.compression model configuration."
                )
+            elif _aux_fail_model:
+                lines.append(
+                    f"ℹ️ Configured compression model `{_aux_fail_model}` failed "
+                    f"({_aux_fail_err or 'unknown error'}). Recovered using your main "
+                    "model — context is intact — but you may want to check "
+                    "`auxiliary.compression.model` in config.yaml."
+                )
            return "\n".join(lines)
        except Exception as e:
            logger.warning("Manual compress failed: %s", e)
--- a/run_agent.py
+++ b/run_agent.py
@@ -8460,6 +8460,23 @@ class AIAgent:
                    f"⚠ Compression summary failed: {summary_error}. "
                    "Inserted a fallback context marker."
                )
+        else:
+            # No hard failure — but did the configured aux model error out
+            # and get recovered by retrying on main?  Surface that so users
+            # know their auxiliary.compression.model setting is broken even
+            # though compression succeeded.
+            _aux_fail_model = getattr(self.context_compressor, "_last_aux_model_failure_model", None)
+            _aux_fail_err = getattr(self.context_compressor, "_last_aux_model_failure_error", None)
+            if _aux_fail_model:
+                # Dedup on (model, error) so we don't spam on every compaction
+                _aux_key = (_aux_fail_model, _aux_fail_err)
+                if getattr(self, "_last_aux_fallback_warning_key", None) != _aux_key:
+                    self._last_aux_fallback_warning_key = _aux_key
+                    self._emit_warning(
+                        f"ℹ Configured compression model '{_aux_fail_model}' failed "
+                        f"({_aux_fail_err or 'unknown error'}). Recovered using main model — "
+                        "check auxiliary.compression.model in config.yaml."
+                    )

        todo_snapshot = self._todo_store.format_for_injection()
        if todo_snapshot:
--- a/tests/agent/test_context_compressor.py
+++ b/tests/agent/test_context_compressor.py
@@ -285,6 +285,12 @@ class TestSummaryFallbackToMainModel:
        assert "model" not in mock_call.call_args_list[1].kwargs
        assert result is not None
        assert "summary via main model" in result
+        # Aux-model failure is recorded even though retry succeeded — this is
+        # how callers (gateway /compress, CLI warning) know to tell the user
+        # their auxiliary.compression.model setting is broken.
+        assert c._last_aux_model_failure_model == "broken-aux-model"
+        assert c._last_aux_model_failure_error is not None
+        assert "404" in c._last_aux_model_failure_error

    def test_unknown_error_falls_back_to_main_and_succeeds(self):
        """Errors that don't match the 404/503/model_not_found fast-path
@@ -317,6 +323,10 @@ class TestSummaryFallbackToMainModel:
        assert "model" not in mock_call.call_args_list[1].kwargs
        assert result is not None
        assert "summary via main model" in result
+        # Aux-model failure recorded despite successful recovery
+        assert c._last_aux_model_failure_model == "broken-aux-model"
+        assert c._last_aux_model_failure_error is not None
+        assert "400" in c._last_aux_model_failure_error

    def test_no_fallback_when_summary_model_equals_main_model(self):
        """If the aux model IS the main model, there's nowhere to fall back
@@ -367,6 +377,97 @@ class TestSummaryFallbackToMainModel:
        assert c._summary_model_fallen_back is True


+class TestAuxModelFallbackSurfacedToCallers:
+    """When summary_model fails but retry-on-main succeeds, compress() must
+    expose the aux-model failure via _last_aux_model_failure_{model,error}
+    so gateway /compress and CLI callers can warn the user about their
+    broken auxiliary.compression.model config — silent recovery would hide
+    a misconfiguration only the user can fix."""
+
+    def _make_msgs(self):
+        return [
+            {"role": "system", "content": "sys"},
+            {"role": "user", "content": "msg 1"},
+            {"role": "assistant", "content": "msg 2"},
+            {"role": "user", "content": "msg 3"},
+            {"role": "assistant", "content": "msg 4"},
+            {"role": "user", "content": "msg 5"},
+            {"role": "assistant", "content": "msg 6"},
+            {"role": "user", "content": "msg 7"},
+        ]
+
+    def test_compress_exposes_aux_failure_fields_after_successful_fallback(self):
+        mock_ok = MagicMock()
+        mock_ok.choices = [MagicMock()]
+        mock_ok.choices[0].message.content = "summary via main"
+        err_400 = Exception("400 provider rejected configured model")
+        err_400.status_code = 400
+
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
+            c = ContextCompressor(
+                model="main-model",
+                summary_model_override="broken-aux-model",
+                quiet_mode=True,
+                protect_first_n=2,
+                protect_last_n=2,
+            )
+
+        with patch(
+            "agent.context_compressor.call_llm",
+            side_effect=[err_400, mock_ok],
+        ):
+            result = c.compress(self._make_msgs())
+
+        # Recovery succeeded → no fallback placeholder
+        assert c._last_summary_fallback_used is False
+        # But aux-model failure IS recorded for the gateway/CLI warning
+        assert c._last_aux_model_failure_model == "broken-aux-model"
+        assert c._last_aux_model_failure_error is not None
+        assert "400" in c._last_aux_model_failure_error
+        # Result is well-formed with a real summary, not a placeholder
+        assert any(
+            isinstance(m.get("content"), str) and "summary via main" in m["content"]
+            for m in result
+        )
+
+    def test_compress_clears_aux_failure_fields_at_start_of_next_call(self):
+        """A subsequent successful compression must clear the aux-failure
+        fields so the warning doesn't persist forever."""
+        mock_ok = MagicMock()
+        mock_ok.choices = [MagicMock()]
+        mock_ok.choices[0].message.content = "summary via main"
+        err_400 = Exception("400 aux model busted")
+        err_400.status_code = 400
+
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
+            c = ContextCompressor(
+                model="main-model",
+                summary_model_override="broken-aux-model",
+                quiet_mode=True,
+                protect_first_n=2,
+                protect_last_n=2,
+            )
+
+        # Call 1: aux fails, retry-on-main succeeds
+        with patch(
+            "agent.context_compressor.call_llm",
+            side_effect=[err_400, mock_ok],
+        ):
+            c.compress(self._make_msgs())
+        assert c._last_aux_model_failure_model == "broken-aux-model"
+
+        # Call 2: clean run on main (summary_model was cleared to "" after
+        # first fallback).  Aux-failure fields MUST reset at compress() start
+        # so the old warning state doesn't leak into this call.
+        with patch(
+            "agent.context_compressor.call_llm",
+            return_value=mock_ok,
+        ):
+            c.compress(self._make_msgs())
+        assert c._last_aux_model_failure_model is None
+        assert c._last_aux_model_failure_error is None
+
+
 class TestSummaryFailureTrackingForGatewayWarning:
    """When summary generation fails, the compressor must record dropped count
    + fallback flag so gateway hygiene & /compress can surface a visible
--- a/tests/gateway/test_compress_command.py
+++ b/tests/gateway/test_compress_command.py
@@ -181,3 +181,65 @@ async def test_compress_command_appends_warning_when_summary_generation_fails():
    assert "historical message(s) were removed" in result
    agent_instance.shutdown_memory_provider.assert_called_once()
    agent_instance.close.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_compress_command_surfaces_aux_model_failure_even_when_recovered():
+    """When the user's configured ``auxiliary.compression.model`` errors out
+    but compression recovers by retrying on the main model, /compress must
+    STILL inform the user.  Silent recovery hides broken config the user
+    needs to fix."""
+    history = _make_history()
+    # Compressed transcript — normal successful compression, no placeholder.
+    compressed = [
+        history[0],
+        {"role": "assistant", "content": "summary via main model"},
+        history[-1],
+    ]
+    runner = _make_runner(history)
+    agent_instance = MagicMock()
+    agent_instance.shutdown_memory_provider = MagicMock()
+    agent_instance.close = MagicMock()
+    agent_instance.context_compressor.has_content_to_compress.return_value = True
+    # Fallback placeholder was NOT used — recovery succeeded.
+    agent_instance.context_compressor._last_summary_fallback_used = False
+    agent_instance.context_compressor._last_summary_dropped_count = 0
+    agent_instance.context_compressor._last_summary_error = None
+    # But the configured aux model DID fail before the retry succeeded.
+    agent_instance.context_compressor._last_aux_model_failure_model = (
+        "gemini-3-flash-preview"
+    )
+    agent_instance.context_compressor._last_aux_model_failure_error = (
+        "404 model not found: gemini-3-flash-preview"
+    )
+    agent_instance.session_id = "sess-1"
+    agent_instance._compress_context.return_value = (compressed, "")
+
+    def _estimate(messages):
+        if messages == history:
+            return 100
+        if messages == compressed:
+            return 60
+        raise AssertionError(f"unexpected transcript: {messages!r}")
+
+    with (
+        patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
+        patch("gateway.run._resolve_gateway_model", return_value="test-model"),
+        patch("run_agent.AIAgent", return_value=agent_instance),
+        patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate),
+    ):
+        result = await runner._handle_compress_command(_make_event())
+
+    # Compression succeeded
+    assert "Compressed:" in result
+    # No ⚠️ warning (that's reserved for dropped-turns case)
+    assert "⚠️" not in result
+    # But there IS an info note about the broken aux model
+    assert "ℹ️" in result
+    assert "gemini-3-flash-preview" in result
+    assert "404" in result
+    assert "auxiliary.compression.model" in result
+    # The user's context is explicitly called out as intact
+    assert "intact" in result
+    agent_instance.shutdown_memory_provider.assert_called_once()
+    agent_instance.close.assert_called_once()
--- a/tests/gateway/test_session_hygiene.py
+++ b/tests/gateway/test_session_hygiene.py
@@ -508,4 +508,128 @@ async def test_session_hygiene_warns_user_when_summary_generation_fails(monkeypa
    assert warn["chat_id"] == "-1001"
    assert warn["metadata"] == {"thread_id": "17585"}

-    FakeCompressAgentWithSummaryFailure.last_instance.close.assert_called_once()
+    FakeCompressAgentWithSummaryFailure.last_instance.close.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_session_hygiene_informs_user_when_aux_model_fails_but_recovers(monkeypatch, tmp_path):
+    """When the user's configured ``auxiliary.compression.model`` errors out
+    and we recover via the main model, compression succeeds but the user's
+    config is still broken.  Gateway hygiene must surface an ℹ note so the
+    user knows to fix ``auxiliary.compression.model`` — silent recovery
+    hides a misconfig only they can resolve."""
+    fake_dotenv = types.ModuleType("dotenv")
+    fake_dotenv.load_dotenv = lambda *args, **kwargs: None
+    monkeypatch.setitem(sys.modules, "dotenv", fake_dotenv)
+
+    class FakeCompressAgentWithAuxRecovery:
+        last_instance = None
+
+        def __init__(self, **kwargs):
+            self.model = kwargs.get("model")
+            self.session_id = kwargs.get("session_id", "fake-session")
+            self._print_fn = None
+            self.shutdown_memory_provider = MagicMock()
+            self.close = MagicMock()
+            # Compression succeeded (no placeholder inserted) but the
+            # configured aux model errored and we fell back to main.
+            self.context_compressor = SimpleNamespace(
+                _last_summary_fallback_used=False,
+                _last_summary_dropped_count=0,
+                _last_summary_error=None,
+                _last_aux_model_failure_model="gemini-3-flash-preview",
+                _last_aux_model_failure_error="404 model not found",
+            )
+            type(self).last_instance = self
+
+        def _compress_context(self, messages, *_args, **_kwargs):
+            self.session_id = f"{self.session_id}_compressed"
+            return ([{"role": "assistant", "content": "real summary"}], None)
+
+    fake_run_agent = types.ModuleType("run_agent")
+    fake_run_agent.AIAgent = FakeCompressAgentWithAuxRecovery
+    monkeypatch.setitem(sys.modules, "run_agent", fake_run_agent)
+
+    gateway_run = importlib.import_module("gateway.run")
+    GatewayRunner = gateway_run.GatewayRunner
+
+    adapter = HygieneCaptureAdapter()
+    runner = object.__new__(GatewayRunner)
+    runner.config = GatewayConfig(
+        platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="fake-token")}
+    )
+    runner.adapters = {Platform.TELEGRAM: adapter}
+    runner._voice_mode = {}
+    runner.hooks = SimpleNamespace(emit=AsyncMock(), loaded_hooks=False)
+    runner.session_store = MagicMock()
+    runner.session_store.get_or_create_session.return_value = SessionEntry(
+        session_key="agent:main:telegram:group:-1001:17585",
+        session_id="sess-1",
+        created_at=datetime.now(),
+        updated_at=datetime.now(),
+        platform=Platform.TELEGRAM,
+        chat_type="group",
+    )
+    runner.session_store.load_transcript.return_value = _make_history(6, content_size=400)
+    runner.session_store.has_any_sessions.return_value = True
+    runner.session_store.rewrite_transcript = MagicMock()
+    runner.session_store.append_to_transcript = MagicMock()
+    runner._running_agents = {}
+    runner._pending_messages = {}
+    runner._pending_approvals = {}
+    runner._session_db = None
+    runner._is_user_authorized = lambda _source: True
+    runner._set_session_env = lambda _context: None
+    runner._run_agent = AsyncMock(
+        return_value={
+            "final_response": "ok",
+            "messages": [],
+            "tools": [],
+            "history_offset": 0,
+            "last_prompt_tokens": 0,
+        }
+    )
+
+    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
+    monkeypatch.setattr(gateway_run, "_resolve_runtime_agent_kwargs", lambda: {"api_key": "***"})
+    monkeypatch.setattr(
+        "agent.model_metadata.get_model_context_length",
+        lambda *_args, **_kwargs: 100,
+    )
+    monkeypatch.setenv("TELEGRAM_HOME_CHANNEL", "795544298")
+
+    event = MessageEvent(
+        text="hello",
+        source=SessionSource(
+            platform=Platform.TELEGRAM,
+            chat_id="-1001",
+            chat_type="group",
+            thread_id="17585",
+            user_id="12345",
+        ),
+        message_id="1",
+    )
+
+    result = await runner._handle_message(event)
+
+    assert result == "ok"
+    # No ⚠️ hard-failure warning (that's for dropped turns)
+    hard_warnings = [s for s in adapter.sent if "Context compression summary failed" in s["content"]]
+    assert len(hard_warnings) == 0, adapter.sent
+    # But an ℹ note about the configured aux model must be delivered.
+    aux_notes = [
+        s for s in adapter.sent
+        if "Configured compression model" in s["content"]
+    ]
+    assert len(aux_notes) == 1, (
+        f"Expected 1 aux-model fallback notice, got {len(aux_notes)}: {adapter.sent}"
+    )
+    note = aux_notes[0]
+    assert "gemini-3-flash-preview" in note["content"]
+    assert "404" in note["content"]
+    assert "auxiliary.compression.model" in note["content"]
+    # Note must land in the originating topic/thread.
+    assert note["chat_id"] == "-1001"
+    assert note["metadata"] == {"thread_id": "17585"}
+
+    FakeCompressAgentWithAuxRecovery.last_instance.close.assert_called_once()