fix(compression): notify users when configured aux model fails even if main-model fallback recovers (#16775)

A misconfigured auxiliary.compression.model is a user-fixable problem that silent recovery would hide. The previous retry-on-main logic transparently swallowed aux-model failures whenever the fallback succeeded, leaving the user's broken config in place and racking up future failures.

Track the aux-model failure on the compressor alongside the existing fallback-placeholder fields:
- _last_aux_model_failure_model: str | None
- _last_aux_model_failure_error: str | None

Both are set at the moment the aux model errors (captured before summary_model is cleared for retry), regardless of whether the retry succeeds. Cleared at compress() start and on on_session_reset() so a clean run doesn't leak stale warnings.

Surface at three places:
- gateway hygiene auto-compress: ℹ note to the platform adapter (thread_id preserved)
- gateway /compress command: ℹ line appended to the reply
- CLI via _emit_warning: deduped on (model, error) so repeat compactions don't spam

Distinct from the existing ⚠️ dropped-turns warning — different severity, different emoji, explicit 'context is intact' reassurance.
This commit is contained in:
Teknium
2026-04-27 20:08:23 -07:00
committed by GitHub
parent c3e3a9c184
commit 6ea5699e3f
6 changed files with 367 additions and 1 deletions

View File

@@ -340,6 +340,8 @@ class ContextCompressor(ContextEngine):
self._last_summary_error = None
self._last_summary_dropped_count = 0
self._last_summary_fallback_used = False
self._last_aux_model_failure_error = None
self._last_aux_model_failure_model = None
self._last_compression_savings_pct = 100.0
self._ineffective_compression_count = 0
@@ -448,6 +450,12 @@ class ContextCompressor(ContextEngine):
# (gateway hygiene, /compress) can surface a visible warning.
self._last_summary_dropped_count: int = 0
self._last_summary_fallback_used: bool = False
# When a user-configured summary model fails and we recover by
# retrying on the main model, record the failure so gateway /
# CLI callers can still warn the user even though compression
# succeeded. Silent recovery would hide the broken config.
self._last_aux_model_failure_error: Optional[str] = None
self._last_aux_model_failure_model: Optional[str] = None
def update_from_response(self, usage: Dict[str, Any]):
"""Update tracked token usage from API response."""
@@ -907,6 +915,14 @@ The user has requested that this compaction PRIORITISE preserving all informatio
"Falling back to main model '%s' for compression.",
self.summary_model, e, self.model,
)
# Record the aux-model failure so callers can warn the user
# even if the retry-on-main succeeds — a misconfigured aux
# model is something the user needs to fix.
_err_text = str(e).strip() or e.__class__.__name__
if len(_err_text) > 220:
_err_text = _err_text[:217].rstrip() + "..."
self._last_aux_model_failure_error = _err_text
self._last_aux_model_failure_model = self.summary_model
self.summary_model = "" # empty = use main model
self._summary_failure_cooldown_until = 0.0 # no cooldown
return self._generate_summary(turns_to_summarize, focus_topic=focus_topic) # retry immediately
@@ -931,6 +947,14 @@ The user has requested that this compaction PRIORITISE preserving all informatio
"Retrying on main model '%s' before giving up.",
self.summary_model, e, self.model,
)
# Record the aux-model failure (see 404 branch above) — user
# should know their configured model is broken even if main
# recovers the call.
_err_text = str(e).strip() or e.__class__.__name__
if len(_err_text) > 220:
_err_text = _err_text[:217].rstrip() + "..."
self._last_aux_model_failure_error = _err_text
self._last_aux_model_failure_model = self.summary_model
self.summary_model = "" # empty = use main model
self._summary_failure_cooldown_until = 0.0
return self._generate_summary(turns_to_summarize, focus_topic=focus_topic)
@@ -1232,6 +1256,8 @@ The user has requested that this compaction PRIORITISE preserving all informatio
self._last_summary_dropped_count = 0
self._last_summary_fallback_used = False
self._last_summary_error = None
self._last_aux_model_failure_error = None
self._last_aux_model_failure_model = None
n_messages = len(messages)
# Only need head + 3 tail messages minimum (token budget decides the real tail size)
_min_for_compress = self.protect_first_n + 3 + 1

View File

@@ -4828,6 +4828,30 @@ class GatewayRunner:
"Failed to deliver compression-failure warning to user: %s",
_werr,
)
# Separately: if the user's CONFIGURED aux
# model failed and we recovered by falling
# back to the main model, tell them — a
# misconfigured auxiliary.compression.model
# is something only they can fix, and
# silent recovery would hide it.
elif _comp is not None and getattr(_comp, "_last_aux_model_failure_model", None):
_aux_model = getattr(_comp, "_last_aux_model_failure_model", "")
_aux_err = getattr(_comp, "_last_aux_model_failure_error", None) or "unknown error"
_aux_msg = (
f" Configured compression model `{_aux_model}` "
f"failed ({_aux_err}). Recovered using your main "
"model — context is intact — but you may want to "
"check `auxiliary.compression.model` in config.yaml."
)
try:
_adapter = self.adapters.get(source.platform)
if _adapter and source.chat_id:
await _adapter.send(source.chat_id, _aux_msg, metadata=_hyg_meta)
except Exception as _werr:
logger.warning(
"Failed to deliver aux-model-fallback notice to user: %s",
_werr,
)
finally:
self._cleanup_agent_resources(_hyg_agent)
@@ -7377,6 +7401,11 @@ class GatewayRunner:
_summary_failed = bool(getattr(compressor, "_last_summary_fallback_used", False))
_dropped_count = int(getattr(compressor, "_last_summary_dropped_count", 0) or 0)
_summary_err = getattr(compressor, "_last_summary_error", None)
# Separately: did the user's CONFIGURED aux model fail
# and we recovered via main? Surface that as an info
# note so they can fix their config.
_aux_fail_model = getattr(compressor, "_last_aux_model_failure_model", None)
_aux_fail_err = getattr(compressor, "_last_aux_model_failure_error", None)
finally:
self._cleanup_agent_resources(tmp_agent)
lines = [f"🗜️ {summary['headline']}"]
@@ -7392,6 +7421,13 @@ class GatewayRunner:
"with a placeholder; earlier context is no longer recoverable. "
"Consider checking your auxiliary.compression model configuration."
)
elif _aux_fail_model:
lines.append(
f" Configured compression model `{_aux_fail_model}` failed "
f"({_aux_fail_err or 'unknown error'}). Recovered using your main "
"model — context is intact — but you may want to check "
"`auxiliary.compression.model` in config.yaml."
)
return "\n".join(lines)
except Exception as e:
logger.warning("Manual compress failed: %s", e)

View File

@@ -8460,6 +8460,23 @@ class AIAgent:
f"⚠ Compression summary failed: {summary_error}. "
"Inserted a fallback context marker."
)
else:
# No hard failure — but did the configured aux model error out
# and get recovered by retrying on main? Surface that so users
# know their auxiliary.compression.model setting is broken even
# though compression succeeded.
_aux_fail_model = getattr(self.context_compressor, "_last_aux_model_failure_model", None)
_aux_fail_err = getattr(self.context_compressor, "_last_aux_model_failure_error", None)
if _aux_fail_model:
# Dedup on (model, error) so we don't spam on every compaction
_aux_key = (_aux_fail_model, _aux_fail_err)
if getattr(self, "_last_aux_fallback_warning_key", None) != _aux_key:
self._last_aux_fallback_warning_key = _aux_key
self._emit_warning(
f" Configured compression model '{_aux_fail_model}' failed "
f"({_aux_fail_err or 'unknown error'}). Recovered using main model — "
"check auxiliary.compression.model in config.yaml."
)
todo_snapshot = self._todo_store.format_for_injection()
if todo_snapshot:

View File

@@ -285,6 +285,12 @@ class TestSummaryFallbackToMainModel:
assert "model" not in mock_call.call_args_list[1].kwargs
assert result is not None
assert "summary via main model" in result
# Aux-model failure is recorded even though retry succeeded — this is
# how callers (gateway /compress, CLI warning) know to tell the user
# their auxiliary.compression.model setting is broken.
assert c._last_aux_model_failure_model == "broken-aux-model"
assert c._last_aux_model_failure_error is not None
assert "404" in c._last_aux_model_failure_error
def test_unknown_error_falls_back_to_main_and_succeeds(self):
"""Errors that don't match the 404/503/model_not_found fast-path
@@ -317,6 +323,10 @@ class TestSummaryFallbackToMainModel:
assert "model" not in mock_call.call_args_list[1].kwargs
assert result is not None
assert "summary via main model" in result
# Aux-model failure recorded despite successful recovery
assert c._last_aux_model_failure_model == "broken-aux-model"
assert c._last_aux_model_failure_error is not None
assert "400" in c._last_aux_model_failure_error
def test_no_fallback_when_summary_model_equals_main_model(self):
"""If the aux model IS the main model, there's nowhere to fall back
@@ -367,6 +377,97 @@ class TestSummaryFallbackToMainModel:
assert c._summary_model_fallen_back is True
class TestAuxModelFallbackSurfacedToCallers:
"""When summary_model fails but retry-on-main succeeds, compress() must
expose the aux-model failure via _last_aux_model_failure_{model,error}
so gateway /compress and CLI callers can warn the user about their
broken auxiliary.compression.model config — silent recovery would hide
a misconfiguration only the user can fix."""
def _make_msgs(self):
return [
{"role": "system", "content": "sys"},
{"role": "user", "content": "msg 1"},
{"role": "assistant", "content": "msg 2"},
{"role": "user", "content": "msg 3"},
{"role": "assistant", "content": "msg 4"},
{"role": "user", "content": "msg 5"},
{"role": "assistant", "content": "msg 6"},
{"role": "user", "content": "msg 7"},
]
def test_compress_exposes_aux_failure_fields_after_successful_fallback(self):
mock_ok = MagicMock()
mock_ok.choices = [MagicMock()]
mock_ok.choices[0].message.content = "summary via main"
err_400 = Exception("400 provider rejected configured model")
err_400.status_code = 400
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(
model="main-model",
summary_model_override="broken-aux-model",
quiet_mode=True,
protect_first_n=2,
protect_last_n=2,
)
with patch(
"agent.context_compressor.call_llm",
side_effect=[err_400, mock_ok],
):
result = c.compress(self._make_msgs())
# Recovery succeeded → no fallback placeholder
assert c._last_summary_fallback_used is False
# But aux-model failure IS recorded for the gateway/CLI warning
assert c._last_aux_model_failure_model == "broken-aux-model"
assert c._last_aux_model_failure_error is not None
assert "400" in c._last_aux_model_failure_error
# Result is well-formed with a real summary, not a placeholder
assert any(
isinstance(m.get("content"), str) and "summary via main" in m["content"]
for m in result
)
def test_compress_clears_aux_failure_fields_at_start_of_next_call(self):
"""A subsequent successful compression must clear the aux-failure
fields so the warning doesn't persist forever."""
mock_ok = MagicMock()
mock_ok.choices = [MagicMock()]
mock_ok.choices[0].message.content = "summary via main"
err_400 = Exception("400 aux model busted")
err_400.status_code = 400
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(
model="main-model",
summary_model_override="broken-aux-model",
quiet_mode=True,
protect_first_n=2,
protect_last_n=2,
)
# Call 1: aux fails, retry-on-main succeeds
with patch(
"agent.context_compressor.call_llm",
side_effect=[err_400, mock_ok],
):
c.compress(self._make_msgs())
assert c._last_aux_model_failure_model == "broken-aux-model"
# Call 2: clean run on main (summary_model was cleared to "" after
# first fallback). Aux-failure fields MUST reset at compress() start
# so the old warning state doesn't leak into this call.
with patch(
"agent.context_compressor.call_llm",
return_value=mock_ok,
):
c.compress(self._make_msgs())
assert c._last_aux_model_failure_model is None
assert c._last_aux_model_failure_error is None
class TestSummaryFailureTrackingForGatewayWarning:
"""When summary generation fails, the compressor must record dropped count
+ fallback flag so gateway hygiene & /compress can surface a visible

View File

@@ -181,3 +181,65 @@ async def test_compress_command_appends_warning_when_summary_generation_fails():
assert "historical message(s) were removed" in result
agent_instance.shutdown_memory_provider.assert_called_once()
agent_instance.close.assert_called_once()
@pytest.mark.asyncio
async def test_compress_command_surfaces_aux_model_failure_even_when_recovered():
"""When the user's configured ``auxiliary.compression.model`` errors out
but compression recovers by retrying on the main model, /compress must
STILL inform the user. Silent recovery hides broken config the user
needs to fix."""
history = _make_history()
# Compressed transcript — normal successful compression, no placeholder.
compressed = [
history[0],
{"role": "assistant", "content": "summary via main model"},
history[-1],
]
runner = _make_runner(history)
agent_instance = MagicMock()
agent_instance.shutdown_memory_provider = MagicMock()
agent_instance.close = MagicMock()
agent_instance.context_compressor.has_content_to_compress.return_value = True
# Fallback placeholder was NOT used — recovery succeeded.
agent_instance.context_compressor._last_summary_fallback_used = False
agent_instance.context_compressor._last_summary_dropped_count = 0
agent_instance.context_compressor._last_summary_error = None
# But the configured aux model DID fail before the retry succeeded.
agent_instance.context_compressor._last_aux_model_failure_model = (
"gemini-3-flash-preview"
)
agent_instance.context_compressor._last_aux_model_failure_error = (
"404 model not found: gemini-3-flash-preview"
)
agent_instance.session_id = "sess-1"
agent_instance._compress_context.return_value = (compressed, "")
def _estimate(messages):
if messages == history:
return 100
if messages == compressed:
return 60
raise AssertionError(f"unexpected transcript: {messages!r}")
with (
patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
patch("gateway.run._resolve_gateway_model", return_value="test-model"),
patch("run_agent.AIAgent", return_value=agent_instance),
patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate),
):
result = await runner._handle_compress_command(_make_event())
# Compression succeeded
assert "Compressed:" in result
# No ⚠️ warning (that's reserved for dropped-turns case)
assert "⚠️" not in result
# But there IS an info note about the broken aux model
assert "" in result
assert "gemini-3-flash-preview" in result
assert "404" in result
assert "auxiliary.compression.model" in result
# The user's context is explicitly called out as intact
assert "intact" in result
agent_instance.shutdown_memory_provider.assert_called_once()
agent_instance.close.assert_called_once()

View File

@@ -508,4 +508,128 @@ async def test_session_hygiene_warns_user_when_summary_generation_fails(monkeypa
assert warn["chat_id"] == "-1001"
assert warn["metadata"] == {"thread_id": "17585"}
FakeCompressAgentWithSummaryFailure.last_instance.close.assert_called_once()
FakeCompressAgentWithSummaryFailure.last_instance.close.assert_called_once()
@pytest.mark.asyncio
async def test_session_hygiene_informs_user_when_aux_model_fails_but_recovers(monkeypatch, tmp_path):
"""When the user's configured ``auxiliary.compression.model`` errors out
and we recover via the main model, compression succeeds but the user's
config is still broken. Gateway hygiene must surface an note so the
user knows to fix ``auxiliary.compression.model`` — silent recovery
hides a misconfig only they can resolve."""
fake_dotenv = types.ModuleType("dotenv")
fake_dotenv.load_dotenv = lambda *args, **kwargs: None
monkeypatch.setitem(sys.modules, "dotenv", fake_dotenv)
class FakeCompressAgentWithAuxRecovery:
last_instance = None
def __init__(self, **kwargs):
self.model = kwargs.get("model")
self.session_id = kwargs.get("session_id", "fake-session")
self._print_fn = None
self.shutdown_memory_provider = MagicMock()
self.close = MagicMock()
# Compression succeeded (no placeholder inserted) but the
# configured aux model errored and we fell back to main.
self.context_compressor = SimpleNamespace(
_last_summary_fallback_used=False,
_last_summary_dropped_count=0,
_last_summary_error=None,
_last_aux_model_failure_model="gemini-3-flash-preview",
_last_aux_model_failure_error="404 model not found",
)
type(self).last_instance = self
def _compress_context(self, messages, *_args, **_kwargs):
self.session_id = f"{self.session_id}_compressed"
return ([{"role": "assistant", "content": "real summary"}], None)
fake_run_agent = types.ModuleType("run_agent")
fake_run_agent.AIAgent = FakeCompressAgentWithAuxRecovery
monkeypatch.setitem(sys.modules, "run_agent", fake_run_agent)
gateway_run = importlib.import_module("gateway.run")
GatewayRunner = gateway_run.GatewayRunner
adapter = HygieneCaptureAdapter()
runner = object.__new__(GatewayRunner)
runner.config = GatewayConfig(
platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="fake-token")}
)
runner.adapters = {Platform.TELEGRAM: adapter}
runner._voice_mode = {}
runner.hooks = SimpleNamespace(emit=AsyncMock(), loaded_hooks=False)
runner.session_store = MagicMock()
runner.session_store.get_or_create_session.return_value = SessionEntry(
session_key="agent:main:telegram:group:-1001:17585",
session_id="sess-1",
created_at=datetime.now(),
updated_at=datetime.now(),
platform=Platform.TELEGRAM,
chat_type="group",
)
runner.session_store.load_transcript.return_value = _make_history(6, content_size=400)
runner.session_store.has_any_sessions.return_value = True
runner.session_store.rewrite_transcript = MagicMock()
runner.session_store.append_to_transcript = MagicMock()
runner._running_agents = {}
runner._pending_messages = {}
runner._pending_approvals = {}
runner._session_db = None
runner._is_user_authorized = lambda _source: True
runner._set_session_env = lambda _context: None
runner._run_agent = AsyncMock(
return_value={
"final_response": "ok",
"messages": [],
"tools": [],
"history_offset": 0,
"last_prompt_tokens": 0,
}
)
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.setattr(gateway_run, "_resolve_runtime_agent_kwargs", lambda: {"api_key": "***"})
monkeypatch.setattr(
"agent.model_metadata.get_model_context_length",
lambda *_args, **_kwargs: 100,
)
monkeypatch.setenv("TELEGRAM_HOME_CHANNEL", "795544298")
event = MessageEvent(
text="hello",
source=SessionSource(
platform=Platform.TELEGRAM,
chat_id="-1001",
chat_type="group",
thread_id="17585",
user_id="12345",
),
message_id="1",
)
result = await runner._handle_message(event)
assert result == "ok"
# No ⚠️ hard-failure warning (that's for dropped turns)
hard_warnings = [s for s in adapter.sent if "Context compression summary failed" in s["content"]]
assert len(hard_warnings) == 0, adapter.sent
# But an note about the configured aux model must be delivered.
aux_notes = [
s for s in adapter.sent
if "Configured compression model" in s["content"]
]
assert len(aux_notes) == 1, (
f"Expected 1 aux-model fallback notice, got {len(aux_notes)}: {adapter.sent}"
)
note = aux_notes[0]
assert "gemini-3-flash-preview" in note["content"]
assert "404" in note["content"]
assert "auxiliary.compression.model" in note["content"]
# Note must land in the originating topic/thread.
assert note["chat_id"] == "-1001"
assert note["metadata"] == {"thread_id": "17585"}
FakeCompressAgentWithAuxRecovery.last_instance.close.assert_called_once()