fix(gateway): look up expired agents in _agent_cache, add global kill_all

Two fixes from PR review:

1. Session expiry was looking in _running_agents for the cached agent,
   but the agents of idle, expired sessions live in _agent_cache. It now
   checks _agent_cache first and falls back to _running_agents.

2. The global cleanup in stop() was missing process_registry.kill_all(),
   so background processes from agents that were evicted without close()
   (branch, fallback) survived shutdown.
This commit is contained in:
pefontana
2026-04-10 17:26:10 -03:00
committed by Teknium
parent f00dd3169f
commit 9555a0cf31

View File

@@ -1348,18 +1348,28 @@ class GatewayRunner:
for key, entry in _expired_entries:
try:
await self._async_flush_memories(entry.session_id)
# Shut down memory provider on the cached agent
cached_agent = self._running_agents.get(key)
if cached_agent and cached_agent is not _AGENT_PENDING_SENTINEL:
# Shut down memory provider and close tool resources
# on the cached agent. Idle agents live in
# _agent_cache (not _running_agents), so look there.
_cached_agent = None
_cache_lock = getattr(self, "_agent_cache_lock", None)
if _cache_lock is not None:
with _cache_lock:
_cached = self._agent_cache.get(key)
_cached_agent = _cached[0] if isinstance(_cached, tuple) else _cached if _cached else None
# Fall back to _running_agents in case the agent is
# still mid-turn when the expiry fires.
if _cached_agent is None:
_cached_agent = self._running_agents.get(key)
if _cached_agent and _cached_agent is not _AGENT_PENDING_SENTINEL:
try:
if hasattr(cached_agent, 'shutdown_memory_provider'):
cached_agent.shutdown_memory_provider()
if hasattr(_cached_agent, 'shutdown_memory_provider'):
_cached_agent.shutdown_memory_provider()
except Exception:
pass
# Close tool resources to prevent zombie processes
try:
if hasattr(cached_agent, 'close'):
cached_agent.close()
if hasattr(_cached_agent, 'close'):
_cached_agent.close()
except Exception:
pass
# Mark as flushed and persist to disk so the flag
@@ -1575,6 +1585,11 @@ class GatewayRunner:
# Global cleanup: kill any remaining tool subprocesses not tied
# to a specific agent (catch-all for zombie prevention).
try:
from tools.process_registry import process_registry
process_registry.kill_all()
except Exception:
pass
try:
from tools.terminal_tool import cleanup_all_environments
cleanup_all_environments()