fix: eliminate provider hang dead zones in retry/timeout architecture (#8985)

Three targeted changes to close the gaps between retry layers that caused users to experience 'No response from provider for 580s' and 'No activity for 15 minutes' despite having 5 layers of retry: 1. Remove non-streaming fallback from streaming path Previously, when all 3 stream retries exhausted, the code fell back to _interruptible_api_call() which had no stale detection and no activity tracking — a black hole that could hang for up to 1800s. Now errors propagate to the main retry loop which has richer recovery (credential rotation, provider fallback, backoff). For 'stream not supported' errors, sets _disable_streaming flag so the main retry loop automatically switches to non-streaming on the next attempt. 2. Add _touch_activity to recovery dead zones The gateway inactivity monitor relies on _touch_activity() to know the agent is alive, but activity was never touched during: - Stale stream detection/kill cycles (180-300s gaps) - Stream retry connection rebuilds - Main retry backoff sleeps (up to 120s) - Error recovery classification Now all these paths touch activity every ~30s, keeping the gateway informed during recovery cycles. 3. Add stale-call detector to non-streaming path _interruptible_api_call() now has the same stale detection pattern as the streaming path: kills hung connections after 300s (default, configurable via HERMES_API_CALL_STALE_TIMEOUT), scaled for large contexts (450s for 50K+ tokens, 600s for 100K+ tokens), disabled for local providers. Also touches activity every ~30s during the wait so the gateway monitor stays informed. Env vars: - HERMES_API_CALL_STALE_TIMEOUT: non-streaming stale timeout (default 300s) - HERMES_STREAM_STALE_TIMEOUT: unchanged (default 180s) Before: worst case ~2+ hours of sequential retries with no feedback After: worst case bounded by gateway inactivity timeout (default 1800s) with continuous activity reporting
2026-04-13 04:55:20 -07:00
parent acdff020b7
commit a5bd56eae3
3 changed files with 162 additions and 140 deletions
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@@ -3519,8 +3519,8 @@ class TestStreamingApiCall:
        call_kwargs = agent.client.chat.completions.create.call_args
        assert call_kwargs[1].get("stream") is True or call_kwargs.kwargs.get("stream") is True

-    def test_api_exception_falls_back_to_non_streaming(self, agent):
-        """When streaming fails before any deltas, fallback to non-streaming is attempted."""
+    def test_api_exception_propagates_no_non_streaming_fallback(self, agent):
+        """When streaming fails before any deltas, error propagates to the main retry loop."""
        agent.client.chat.completions.create.side_effect = ConnectionError("fail")
        # Prevent stream retry logic from replacing the mock client
        with patch.object(agent, "_replace_primary_openai_client", return_value=False):
--- a/tests/run_agent/test_streaming.py
+++ b/tests/run_agent/test_streaming.py
@@ -374,13 +374,19 @@ class TestStreamingCallbacks:


 class TestStreamingFallback:
-    """Verify fallback to non-streaming on ANY streaming error."""
+    """Verify streaming errors propagate to the main retry loop.
+
+    Previously, streaming errors triggered an inline fallback to
+    non-streaming.  Now they propagate so the main retry loop can apply
+    richer recovery (credential rotation, provider fallback, backoff).
+    The only special case: 'stream not supported' sets _disable_streaming
+    so the *next* main-loop retry uses non-streaming automatically.
+    """

-    @patch("run_agent.AIAgent._interruptible_api_call")
    @patch("run_agent.AIAgent._create_request_openai_client")
    @patch("run_agent.AIAgent._close_request_openai_client")
-    def test_stream_error_falls_back(self, mock_close, mock_create, mock_non_stream):
-        """'not supported' error triggers fallback to non-streaming."""
+    def test_stream_not_supported_sets_flag_and_raises(self, mock_close, mock_create):
+        """'not supported' error sets _disable_streaming and propagates."""
        from run_agent import AIAgent

        mock_client = MagicMock()
@@ -389,23 +395,6 @@ class TestStreamingFallback:
        )
        mock_create.return_value = mock_client

-        fallback_response = SimpleNamespace(
-            id="fallback",
-            model="test",
-            choices=[SimpleNamespace(
-                index=0,
-                message=SimpleNamespace(
-                    role="assistant",
-                    content="fallback response",
-                    tool_calls=None,
-                    reasoning_content=None,
-                ),
-                finish_reason="stop",
-            )],
-            usage=None,
-        )
-        mock_non_stream.return_value = fallback_response
-
        agent = AIAgent(
            model="test/model",
            quiet_mode=True,
@@ -415,16 +404,16 @@ class TestStreamingFallback:
        agent.api_mode = "chat_completions"
        agent._interrupt_requested = False

-        response = agent._interruptible_streaming_api_call({})
+        with pytest.raises(Exception, match="Streaming is not supported"):
+            agent._interruptible_streaming_api_call({})

-        assert response.choices[0].message.content == "fallback response"
-        mock_non_stream.assert_called_once()
+        # The flag should be set so the main retry loop switches to non-streaming
+        assert agent._disable_streaming is True

-    @patch("run_agent.AIAgent._interruptible_api_call")
    @patch("run_agent.AIAgent._create_request_openai_client")
    @patch("run_agent.AIAgent._close_request_openai_client")
-    def test_any_stream_error_falls_back(self, mock_close, mock_create, mock_non_stream):
-        """ANY streaming error triggers fallback — not just specific messages."""
+    def test_non_transport_error_propagates(self, mock_close, mock_create):
+        """Non-transport streaming errors propagate to the main retry loop."""
        from run_agent import AIAgent

        mock_client = MagicMock()
@@ -433,23 +422,6 @@ class TestStreamingFallback:
        )
        mock_create.return_value = mock_client

-        fallback_response = SimpleNamespace(
-            id="fallback",
-            model="test",
-            choices=[SimpleNamespace(
-                index=0,
-                message=SimpleNamespace(
-                    role="assistant",
-                    content="fallback after connection error",
-                    tool_calls=None,
-                    reasoning_content=None,
-                ),
-                finish_reason="stop",
-            )],
-            usage=None,
-        )
-        mock_non_stream.return_value = fallback_response
-
        agent = AIAgent(
            model="test/model",
            quiet_mode=True,
@@ -459,24 +431,19 @@ class TestStreamingFallback:
        agent.api_mode = "chat_completions"
        agent._interrupt_requested = False

-        response = agent._interruptible_streaming_api_call({})
+        with pytest.raises(Exception, match="Connection reset by peer"):
+            agent._interruptible_streaming_api_call({})

-        assert response.choices[0].message.content == "fallback after connection error"
-        mock_non_stream.assert_called_once()
-
-    @patch("run_agent.AIAgent._interruptible_api_call")
    @patch("run_agent.AIAgent._create_request_openai_client")
    @patch("run_agent.AIAgent._close_request_openai_client")
-    def test_fallback_error_propagates(self, mock_close, mock_create, mock_non_stream):
-        """When both streaming AND fallback fail, the fallback error propagates."""
+    def test_stream_error_propagates_original(self, mock_close, mock_create):
+        """The original streaming error propagates (not a fallback error)."""
        from run_agent import AIAgent

        mock_client = MagicMock()
        mock_client.chat.completions.create.side_effect = Exception("stream broke")
        mock_create.return_value = mock_client

-        mock_non_stream.side_effect = Exception("Rate limit exceeded")
-
        agent = AIAgent(
            model="test/model",
            quiet_mode=True,
@@ -486,14 +453,13 @@ class TestStreamingFallback:
        agent.api_mode = "chat_completions"
        agent._interrupt_requested = False

-        with pytest.raises(Exception, match="Rate limit exceeded"):
+        with pytest.raises(Exception, match="stream broke"):
            agent._interruptible_streaming_api_call({})

-    @patch("run_agent.AIAgent._interruptible_api_call")
    @patch("run_agent.AIAgent._create_request_openai_client")
    @patch("run_agent.AIAgent._close_request_openai_client")
-    def test_exhausted_transient_stream_error_falls_back(self, mock_close, mock_create, mock_non_stream):
-        """Transient stream errors retry first, then fall back after retries are exhausted."""
+    def test_exhausted_transient_stream_error_propagates(self, mock_close, mock_create):
+        """Transient stream errors retry first, then propagate after retries exhausted."""
        from run_agent import AIAgent
        import httpx

@@ -501,23 +467,6 @@ class TestStreamingFallback:
        mock_client.chat.completions.create.side_effect = httpx.ConnectError("socket closed")
        mock_create.return_value = mock_client

-        fallback_response = SimpleNamespace(
-            id="fallback",
-            model="test",
-            choices=[SimpleNamespace(
-                index=0,
-                message=SimpleNamespace(
-                    role="assistant",
-                    content="fallback after retries exhausted",
-                    tool_calls=None,
-                    reasoning_content=None,
-                ),
-                finish_reason="stop",
-            )],
-            usage=None,
-        )
-        mock_non_stream.return_value = fallback_response
-
        agent = AIAgent(
            model="test/model",
            quiet_mode=True,
@@ -527,23 +476,22 @@ class TestStreamingFallback:
        agent.api_mode = "chat_completions"
        agent._interrupt_requested = False

-        response = agent._interruptible_streaming_api_call({})
+        with pytest.raises(httpx.ConnectError, match="socket closed"):
+            agent._interruptible_streaming_api_call({})

-        assert response.choices[0].message.content == "fallback after retries exhausted"
+        # Should have retried 3 times (default HERMES_STREAM_RETRIES=2 → 3 attempts)
        assert mock_client.chat.completions.create.call_count == 3
-        mock_non_stream.assert_called_once()
        assert mock_close.call_count >= 1

-    @patch("run_agent.AIAgent._interruptible_api_call")
    @patch("run_agent.AIAgent._create_request_openai_client")
    @patch("run_agent.AIAgent._close_request_openai_client")
-    def test_sse_connection_lost_retried_as_transient(self, mock_close, mock_create, mock_non_stream):
+    def test_sse_connection_lost_retried_as_transient(self, mock_close, mock_create):
        """SSE 'Network connection lost' (APIError w/ no status_code) retries like httpx errors.

        OpenRouter sends {"error":{"message":"Network connection lost."}} as an SSE
        event when the upstream stream drops.  The OpenAI SDK raises APIError from
        this.  It should be retried at the streaming level, same as httpx connection
-        errors, before falling back to non-streaming.
+        errors, then propagate to the main retry loop after exhaustion.
        """
        from run_agent import AIAgent
        import httpx
@@ -561,23 +509,6 @@ class TestStreamingFallback:
        mock_client.chat.completions.create.side_effect = sse_error
        mock_create.return_value = mock_client

-        fallback_response = SimpleNamespace(
-            id="fallback",
-            model="test",
-            choices=[SimpleNamespace(
-                index=0,
-                message=SimpleNamespace(
-                    role="assistant",
-                    content="fallback after SSE retries",
-                    tool_calls=None,
-                    reasoning_content=None,
-                ),
-                finish_reason="stop",
-            )],
-            usage=None,
-        )
-        mock_non_stream.return_value = fallback_response
-
        agent = AIAgent(
            model="test/model",
            quiet_mode=True,
@@ -587,21 +518,18 @@ class TestStreamingFallback:
        agent.api_mode = "chat_completions"
        agent._interrupt_requested = False

-        response = agent._interruptible_streaming_api_call({})
+        with pytest.raises(OAIAPIError):
+            agent._interruptible_streaming_api_call({})

-        assert response.choices[0].message.content == "fallback after SSE retries"
        # Should retry 3 times (default HERMES_STREAM_RETRIES=2 → 3 attempts)
-        # before falling back to non-streaming
        assert mock_client.chat.completions.create.call_count == 3
-        mock_non_stream.assert_called_once()
        # Connection cleanup should happen for each failed retry
        assert mock_close.call_count >= 2

-    @patch("run_agent.AIAgent._interruptible_api_call")
    @patch("run_agent.AIAgent._create_request_openai_client")
    @patch("run_agent.AIAgent._close_request_openai_client")
-    def test_sse_non_connection_error_falls_back_immediately(self, mock_close, mock_create, mock_non_stream):
-        """SSE errors that aren't connection-related still fall back immediately (no stream retry)."""
+    def test_sse_non_connection_error_propagates_immediately(self, mock_close, mock_create):
+        """SSE errors that aren't connection-related propagate immediately (no stream retry)."""
        from run_agent import AIAgent
        import httpx

@@ -616,23 +544,6 @@ class TestStreamingFallback:
        mock_client.chat.completions.create.side_effect = sse_error
        mock_create.return_value = mock_client

-        fallback_response = SimpleNamespace(
-            id="fallback",
-            model="test",
-            choices=[SimpleNamespace(
-                index=0,
-                message=SimpleNamespace(
-                    role="assistant",
-                    content="fallback no retry",
-                    tool_calls=None,
-                    reasoning_content=None,
-                ),
-                finish_reason="stop",
-            )],
-            usage=None,
-        )
-        mock_non_stream.return_value = fallback_response
-
        agent = AIAgent(
            model="test/model",
            quiet_mode=True,
@@ -642,12 +553,11 @@ class TestStreamingFallback:
        agent.api_mode = "chat_completions"
        agent._interrupt_requested = False

-        response = agent._interruptible_streaming_api_call({})
+        with pytest.raises(OAIAPIError):
+            agent._interruptible_streaming_api_call({})

-        assert response.choices[0].message.content == "fallback no retry"
-        # Should NOT retry — goes straight to non-streaming fallback
+        # Should NOT retry — propagates immediately
        assert mock_client.chat.completions.create.call_count == 1
-        mock_non_stream.assert_called_once()


 # ── Test: Reasoning Streaming ────────────────────────────────────────────