From 12b109b6640a573abf685d3c881cab2a9fc5c3aa Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 16 Apr 2026 02:32:21 -0700 Subject: [PATCH] fix: enable TCP keepalives to detect dead provider connections (#10324) (#10933) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a custom provider drops a connection mid-stream, the TCP socket can enter CLOSE-WAIT and the httpx read timeout may never fire — epoll_wait blocks indefinitely because no data or error signal arrives. The agent hangs until manually killed. The existing defenses (httpx read timeout, stale stream detector, _force_close_tcp_sockets) are all time-based and work correctly once triggered, but they rely on the socket layer reporting the dead connection. Without TCP keepalives, the kernel has no reason to probe a silent connection. Fix: inject SO_KEEPALIVE + TCP_KEEPIDLE/KEEPINTVL/KEEPCNT into the httpx transport via socket_options. The kernel probes idle connections after 30s, retries every 10s, gives up after 3 failures — dead peer detected within ~60s instead of hanging forever. Platform-aware: uses TCP_KEEPIDLE on Linux, TCP_KEEPALIVE on macOS. Falls back silently if socket options aren't available (Windows, etc.). Closes #10324 --- run_agent.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/run_agent.py b/run_agent.py index cb5dbf4b1..944217e6b 100644 --- a/run_agent.py +++ b/run_agent.py @@ -4366,6 +4366,29 @@ class AIAgent: self._client_log_context(), ) return client + # Inject TCP keepalives to detect dead connections faster (#10324). + # Without keepalives, a provider that drops mid-stream leaves the + # socket in CLOSE-WAIT and epoll_wait may never fire, causing the + # agent to hang indefinitely. Keepalive probes detect the dead + # peer within ~60s (30s idle + 3×10s probes). + if "http_client" not in client_kwargs: + try: + import httpx as _httpx + import socket as _socket + _sock_opts = [(_socket.SOL_SOCKET, _socket.SO_KEEPALIVE, 1)] + if hasattr(_socket, "TCP_KEEPIDLE"): + # Linux + _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPIDLE, 30)) + _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPINTVL, 10)) + _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPCNT, 3)) + elif hasattr(_socket, "TCP_KEEPALIVE"): + # macOS (uses TCP_KEEPALIVE instead of TCP_KEEPIDLE) + _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPALIVE, 30)) + client_kwargs["http_client"] = _httpx.Client( + transport=_httpx.HTTPTransport(socket_options=_sock_opts), + ) + except Exception: + pass # Fall through to default transport if socket opts fail client = OpenAI(**client_kwargs) logger.info( "OpenAI client created (%s, shared=%s) %s",