From 708f187549d5127f94aacf6af839aabe5819094d Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 28 Mar 2026 14:25:12 -0700 Subject: [PATCH] fix(gateway): exit with failure when all platforms fail with retryable errors (#3592) When all messaging platforms exhaust retries and get queued for background reconnection, exit with code 1 so systemd Restart=on-failure can restart the process. Previously the gateway stayed alive as a zombie with no connected platforms and exit code 0. Salvaged from PR #3567 by kelsia14. Test updates added. Co-authored-by: kelsia14 --- gateway/run.py | 20 +++++++++++--- tests/gateway/test_platform_reconnect.py | 32 ++++++++++++++++++++-- tests/gateway/test_runner_fatal_adapter.py | 5 ++-- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 847db36c9..f71fc2280 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -745,10 +745,22 @@ class GatewayRunner: logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.") await self.stop() elif not self.adapters and self._failed_platforms: - logger.warning( - "No connected messaging platforms remain, but %d platform(s) queued for reconnection", - len(self._failed_platforms), - ) + # All platforms are down and queued for background reconnection. + # If the error is retryable, exit with failure so systemd Restart=on-failure + # can restart the process. Otherwise stay alive and keep retrying in background. + if adapter.fatal_error_retryable: + self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors" + self._exit_with_failure = True + logger.error( + "All messaging platforms failed with retryable errors. " + "Shutting down gateway for service restart (systemd will retry)." + ) + await self.stop() + else: + logger.warning( + "No connected messaging platforms remain, but %d platform(s) queued for reconnection", + len(self._failed_platforms), + ) def _request_clean_exit(self, reason: str) -> None: self._exit_cleanly = True diff --git a/tests/gateway/test_platform_reconnect.py b/tests/gateway/test_platform_reconnect.py index 3073f2f5d..68dfd2044 100644 --- a/tests/gateway/test_platform_reconnect.py +++ b/tests/gateway/test_platform_reconnect.py @@ -344,6 +344,7 @@ class TestRuntimeDisconnectQueuing: async def test_retryable_runtime_error_queued_for_reconnect(self): """Retryable runtime errors should add the platform to _failed_platforms.""" runner = _make_runner() + runner.stop = AsyncMock() adapter = StubAdapter(succeed=True) adapter._set_fatal_error("network_error", "DNS failure", retryable=True) @@ -371,8 +372,12 @@ class TestRuntimeDisconnectQueuing: assert Platform.TELEGRAM not in runner._failed_platforms @pytest.mark.asyncio - async def test_retryable_error_prevents_shutdown_when_queued(self): - """Gateway should not shut down if failed platforms are queued for reconnection.""" + async def test_retryable_error_exits_for_service_restart_when_all_down(self): + """Gateway should exit with failure when all platforms fail with retryable errors. + + This lets systemd Restart=on-failure restart the process, which is more + reliable than in-process background reconnection after exhausted retries. + """ runner = _make_runner() runner.stop = AsyncMock() @@ -382,7 +387,28 @@ class TestRuntimeDisconnectQueuing: await runner._handle_adapter_fatal_error(adapter) - # stop() should NOT have been called since we have platforms queued + # stop() SHOULD be called — gateway exits for systemd restart + runner.stop.assert_called_once() + assert runner._exit_with_failure is True + assert Platform.TELEGRAM in runner._failed_platforms + + @pytest.mark.asyncio + async def test_retryable_error_no_exit_when_other_adapters_still_connected(self): + """Gateway should NOT exit if some adapters are still connected.""" + runner = _make_runner() + runner.stop = AsyncMock() + + failing_adapter = StubAdapter(succeed=True) + failing_adapter._set_fatal_error("network_error", "DNS failure", retryable=True) + runner.adapters[Platform.TELEGRAM] = failing_adapter + + # Another adapter is still connected + healthy_adapter = StubAdapter(succeed=True) + runner.adapters[Platform.DISCORD] = healthy_adapter + + await runner._handle_adapter_fatal_error(failing_adapter) + + # stop() should NOT have been called — Discord is still up runner.stop.assert_not_called() assert Platform.TELEGRAM in runner._failed_platforms diff --git a/tests/gateway/test_runner_fatal_adapter.py b/tests/gateway/test_runner_fatal_adapter.py index 6eb285059..13b9a7d99 100644 --- a/tests/gateway/test_runner_fatal_adapter.py +++ b/tests/gateway/test_runner_fatal_adapter.py @@ -89,7 +89,8 @@ async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatc await runner._handle_adapter_fatal_error(adapter) - # Should NOT shut down — platform is queued for reconnection - runner.stop.assert_not_awaited() + # Should shut down with failure — systemd Restart=on-failure will restart + runner.stop.assert_awaited_once() + assert runner._exit_with_failure is True assert Platform.WHATSAPP in runner._failed_platforms assert runner._failed_platforms[Platform.WHATSAPP]["attempts"] == 0