fix(environments): prevent terminal hang when commands background children (#8340)

When a user's command backgrounds a child (`cmd &`, `setsid cmd & disown`,
etc.), the backgrounded grandchild inherits the write-end of our stdout
pipe via fork(). The old `for line in proc.stdout` drain never EOF'd
until the grandchild closed the pipe — so for a uvicorn server, the
terminal tool hung indefinitely (users reported the whole session
deadlocking when asking the agent to restart a backend).

Fix: switch _drain() to select()-based non-blocking reads and stop
draining shortly after bash exits even if the pipe hasn't EOF'd. Any
output the grandchild writes after that point goes to an orphaned pipe,
which is exactly what the user asked for when they said '&'.

Adds regression tests covering the issue's exact repro and 5 related
patterns (plain bg, setsid+disown, streaming output, high volume,
timeout, UTF-8).
This commit is contained in:
Teknium
2026-04-19 11:12:39 -07:00
committed by Teknium
parent 611657487f
commit 0a02fbd842
2 changed files with 162 additions and 11 deletions

View File

@@ -0,0 +1,111 @@
"""Regression tests for issue #8340.
When a user command backgrounds a child process (``cmd &``, ``setsid cmd &
disown``, etc.), the backgrounded grandchild inherits the write-end of our
stdout pipe via fork(). Before the fix, the drain thread's blocking
``for line in proc.stdout`` would never see EOF until that grandchild
closed the pipe — causing the terminal tool to hang for the full lifetime
of the backgrounded service (indefinitely for a uvicorn server).
The fix switches ``_drain()`` to select()-based non-blocking reads and
stops draining shortly after bash exits even if the pipe hasn't EOF'd.
"""
import json
import subprocess
import time
import pytest
from tools.environments.local import LocalEnvironment
def _pkill(pattern: str) -> None:
subprocess.run(f"pkill -9 -f {pattern!r} 2>/dev/null", shell=True)
@pytest.fixture
def local_env():
env = LocalEnvironment(cwd="/tmp")
try:
yield env
finally:
env.cleanup()
class TestBackgroundChildDoesNotHang:
"""Regression guard for issue #8340."""
def test_plain_background_returns_promptly(self, local_env):
"""``cmd &`` with no output redirection must not hang on pipe inherit."""
marker = "hermes_8340_plain_bg"
cmd = f'python3 -c "import time; time.sleep(60)" & echo {marker}'
try:
t0 = time.monotonic()
result = local_env.execute(cmd, timeout=15)
elapsed = time.monotonic() - t0
assert elapsed < 4.0, (
f"terminal_tool hung for {elapsed:.1f}s — drain thread "
f"is still blocking on backgrounded child's inherited pipe fd"
)
assert result["returncode"] == 0
assert marker in result["output"]
finally:
_pkill("time.sleep(60)")
def test_setsid_disown_pattern_returns_promptly(self, local_env):
"""The exact pattern from the issue: setsid ... & disown."""
cmd = (
'setsid python3 -c "import time; time.sleep(60)" '
'> /dev/null 2>&1 < /dev/null & disown; echo started'
)
try:
t0 = time.monotonic()
result = local_env.execute(cmd, timeout=15)
elapsed = time.monotonic() - t0
assert elapsed < 4.0, f"setsid+disown path hung for {elapsed:.1f}s"
assert result["returncode"] == 0
assert "started" in result["output"]
finally:
_pkill("time.sleep(60)")
def test_foreground_streaming_output_still_captured(self, local_env):
"""Sanity: incremental output over time must still be captured in full."""
cmd = 'for i in 1 2 3; do echo "tick $i"; sleep 0.2; done; echo done'
t0 = time.monotonic()
result = local_env.execute(cmd, timeout=10)
elapsed = time.monotonic() - t0
# Loop body sleeps ~0.6s total — elapsed should be close to that.
assert 0.5 < elapsed < 3.0
assert result["returncode"] == 0
for expected in ("tick 1", "tick 2", "tick 3", "done"):
assert expected in result["output"], f"missing {expected!r}"
def test_high_volume_output_complete(self, local_env):
"""Sanity: select-based drain must not drop lines under load."""
result = local_env.execute("seq 1 3000", timeout=10)
lines = result["output"].strip().split("\n")
assert result["returncode"] == 0
assert len(lines) == 3000
assert lines[0] == "1"
assert lines[-1] == "3000"
def test_timeout_path_still_works(self, local_env):
"""Foreground command exceeding timeout must still be killed."""
t0 = time.monotonic()
result = local_env.execute("sleep 30", timeout=2)
elapsed = time.monotonic() - t0
assert elapsed < 4.0
assert result["returncode"] == 124
assert "timed out" in result["output"].lower()
def test_utf8_output_decoded_correctly(self, local_env):
"""Multibyte UTF-8 chunks must decode cleanly under select-based reads."""
result = local_env.execute("echo 日本語 café résumé", timeout=5)
assert result["returncode"] == 0
assert "日本語" in result["output"]
assert "café" in result["output"]
assert "résumé" in result["output"]

View File

@@ -9,6 +9,7 @@ or a temp file (local).
import json
import logging
import os
import select
import shlex
import subprocess
import threading
@@ -436,17 +437,53 @@ class BaseEnvironment(ABC):
"""
output_chunks: list[str] = []
# Non-blocking drain via select().
#
# The old pattern — ``for line in proc.stdout`` — blocks on
# ``readline()`` until the pipe reaches EOF. When the user's command
# backgrounds a process (``cmd &``, ``setsid cmd & disown``, etc.),
# that backgrounded grandchild inherits the write-end of our stdout
# pipe via ``fork()``. Even after ``bash`` itself exits, the pipe
# stays open because the grandchild still holds it — so the drain
# thread never returns and the tool hangs for the full lifetime of
# the grandchild (issue #8340: users reported indefinite hangs when
# restarting uvicorn with ``setsid ... & disown``).
#
# The fix: select() with a short poll interval, and stop draining
# shortly after ``bash`` exits even if the pipe hasn't EOF'd yet.
# Any output the grandchild writes after that point goes to an
# orphaned pipe (harmless — the kernel reaps it when our end closes).
def _drain():
try:
for line in proc.stdout:
output_chunks.append(line)
except UnicodeDecodeError:
output_chunks.clear()
output_chunks.append(
"[binary output detected — raw bytes not displayable]"
)
except (ValueError, OSError):
pass
fd = proc.stdout.fileno()
idle_after_exit = 0
while True:
try:
ready, _, _ = select.select([fd], [], [], 0.1)
except (ValueError, OSError):
break # fd already closed
if ready:
try:
chunk = os.read(fd, 4096)
except (ValueError, OSError):
break
if not chunk:
break # true EOF — all writers closed
try:
output_chunks.append(chunk.decode("utf-8"))
except UnicodeDecodeError:
output_chunks.clear()
output_chunks.append(
"[binary output detected — raw bytes not displayable]"
)
break
idle_after_exit = 0
elif proc.poll() is not None:
# bash is gone and the pipe was idle for ~100ms. Give
# it two more cycles to catch any buffered tail, then
# stop — otherwise we wait forever on a grandchild pipe.
idle_after_exit += 1
if idle_after_exit >= 3:
break
drain_thread = threading.Thread(target=_drain, daemon=True)
drain_thread.start()
@@ -553,7 +590,10 @@ class BaseEnvironment(ABC):
pass # cleanup is best-effort
raise
drain_thread.join(timeout=5)
# Drain thread now exits promptly after bash does (~300ms idle
# check). A short join is enough; a long one would be a bug since
# it means the non-blocking loop itself stopped cooperating.
drain_thread.join(timeout=2)
try:
proc.stdout.close()