fix(environments): prevent terminal hang when commands background children (#8340)
When a user's command backgrounds a child (`cmd &`, `setsid cmd & disown`, etc.), the backgrounded grandchild inherits the write-end of our stdout pipe via fork(). The old `for line in proc.stdout` drain never EOF'd until the grandchild closed the pipe — so for a uvicorn server, the terminal tool hung indefinitely (users reported the whole session deadlocking when asking the agent to restart a backend). Fix: switch _drain() to select()-based non-blocking reads and stop draining shortly after bash exits even if the pipe hasn't EOF'd. Any output the grandchild writes after that point goes to an orphaned pipe, which is exactly what the user asked for when they said '&'. Adds regression tests covering the issue's exact repro and 5 related patterns (plain bg, setsid+disown, streaming output, high volume, timeout, UTF-8).
This commit is contained in:
111
tests/tools/test_local_background_child_hang.py
Normal file
111
tests/tools/test_local_background_child_hang.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""Regression tests for issue #8340.
|
||||
|
||||
When a user command backgrounds a child process (``cmd &``, ``setsid cmd &
|
||||
disown``, etc.), the backgrounded grandchild inherits the write-end of our
|
||||
stdout pipe via fork(). Before the fix, the drain thread's blocking
|
||||
``for line in proc.stdout`` would never see EOF until that grandchild
|
||||
closed the pipe — causing the terminal tool to hang for the full lifetime
|
||||
of the backgrounded service (indefinitely for a uvicorn server).
|
||||
|
||||
The fix switches ``_drain()`` to select()-based non-blocking reads and
|
||||
stops draining shortly after bash exits even if the pipe hasn't EOF'd.
|
||||
"""
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from tools.environments.local import LocalEnvironment
|
||||
|
||||
|
||||
def _pkill(pattern: str) -> None:
|
||||
subprocess.run(f"pkill -9 -f {pattern!r} 2>/dev/null", shell=True)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def local_env():
|
||||
env = LocalEnvironment(cwd="/tmp")
|
||||
try:
|
||||
yield env
|
||||
finally:
|
||||
env.cleanup()
|
||||
|
||||
|
||||
class TestBackgroundChildDoesNotHang:
|
||||
"""Regression guard for issue #8340."""
|
||||
|
||||
def test_plain_background_returns_promptly(self, local_env):
|
||||
"""``cmd &`` with no output redirection must not hang on pipe inherit."""
|
||||
marker = "hermes_8340_plain_bg"
|
||||
cmd = f'python3 -c "import time; time.sleep(60)" & echo {marker}'
|
||||
try:
|
||||
t0 = time.monotonic()
|
||||
result = local_env.execute(cmd, timeout=15)
|
||||
elapsed = time.monotonic() - t0
|
||||
|
||||
assert elapsed < 4.0, (
|
||||
f"terminal_tool hung for {elapsed:.1f}s — drain thread "
|
||||
f"is still blocking on backgrounded child's inherited pipe fd"
|
||||
)
|
||||
assert result["returncode"] == 0
|
||||
assert marker in result["output"]
|
||||
finally:
|
||||
_pkill("time.sleep(60)")
|
||||
|
||||
def test_setsid_disown_pattern_returns_promptly(self, local_env):
|
||||
"""The exact pattern from the issue: setsid ... & disown."""
|
||||
cmd = (
|
||||
'setsid python3 -c "import time; time.sleep(60)" '
|
||||
'> /dev/null 2>&1 < /dev/null & disown; echo started'
|
||||
)
|
||||
try:
|
||||
t0 = time.monotonic()
|
||||
result = local_env.execute(cmd, timeout=15)
|
||||
elapsed = time.monotonic() - t0
|
||||
|
||||
assert elapsed < 4.0, f"setsid+disown path hung for {elapsed:.1f}s"
|
||||
assert result["returncode"] == 0
|
||||
assert "started" in result["output"]
|
||||
finally:
|
||||
_pkill("time.sleep(60)")
|
||||
|
||||
def test_foreground_streaming_output_still_captured(self, local_env):
|
||||
"""Sanity: incremental output over time must still be captured in full."""
|
||||
cmd = 'for i in 1 2 3; do echo "tick $i"; sleep 0.2; done; echo done'
|
||||
t0 = time.monotonic()
|
||||
result = local_env.execute(cmd, timeout=10)
|
||||
elapsed = time.monotonic() - t0
|
||||
|
||||
# Loop body sleeps ~0.6s total — elapsed should be close to that.
|
||||
assert 0.5 < elapsed < 3.0
|
||||
assert result["returncode"] == 0
|
||||
for expected in ("tick 1", "tick 2", "tick 3", "done"):
|
||||
assert expected in result["output"], f"missing {expected!r}"
|
||||
|
||||
def test_high_volume_output_complete(self, local_env):
|
||||
"""Sanity: select-based drain must not drop lines under load."""
|
||||
result = local_env.execute("seq 1 3000", timeout=10)
|
||||
lines = result["output"].strip().split("\n")
|
||||
assert result["returncode"] == 0
|
||||
assert len(lines) == 3000
|
||||
assert lines[0] == "1"
|
||||
assert lines[-1] == "3000"
|
||||
|
||||
def test_timeout_path_still_works(self, local_env):
|
||||
"""Foreground command exceeding timeout must still be killed."""
|
||||
t0 = time.monotonic()
|
||||
result = local_env.execute("sleep 30", timeout=2)
|
||||
elapsed = time.monotonic() - t0
|
||||
|
||||
assert elapsed < 4.0
|
||||
assert result["returncode"] == 124
|
||||
assert "timed out" in result["output"].lower()
|
||||
|
||||
def test_utf8_output_decoded_correctly(self, local_env):
|
||||
"""Multibyte UTF-8 chunks must decode cleanly under select-based reads."""
|
||||
result = local_env.execute("echo 日本語 café résumé", timeout=5)
|
||||
assert result["returncode"] == 0
|
||||
assert "日本語" in result["output"]
|
||||
assert "café" in result["output"]
|
||||
assert "résumé" in result["output"]
|
||||
@@ -9,6 +9,7 @@ or a temp file (local).
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import select
|
||||
import shlex
|
||||
import subprocess
|
||||
import threading
|
||||
@@ -436,17 +437,53 @@ class BaseEnvironment(ABC):
|
||||
"""
|
||||
output_chunks: list[str] = []
|
||||
|
||||
# Non-blocking drain via select().
|
||||
#
|
||||
# The old pattern — ``for line in proc.stdout`` — blocks on
|
||||
# ``readline()`` until the pipe reaches EOF. When the user's command
|
||||
# backgrounds a process (``cmd &``, ``setsid cmd & disown``, etc.),
|
||||
# that backgrounded grandchild inherits the write-end of our stdout
|
||||
# pipe via ``fork()``. Even after ``bash`` itself exits, the pipe
|
||||
# stays open because the grandchild still holds it — so the drain
|
||||
# thread never returns and the tool hangs for the full lifetime of
|
||||
# the grandchild (issue #8340: users reported indefinite hangs when
|
||||
# restarting uvicorn with ``setsid ... & disown``).
|
||||
#
|
||||
# The fix: select() with a short poll interval, and stop draining
|
||||
# shortly after ``bash`` exits even if the pipe hasn't EOF'd yet.
|
||||
# Any output the grandchild writes after that point goes to an
|
||||
# orphaned pipe (harmless — the kernel reaps it when our end closes).
|
||||
def _drain():
|
||||
try:
|
||||
for line in proc.stdout:
|
||||
output_chunks.append(line)
|
||||
except UnicodeDecodeError:
|
||||
output_chunks.clear()
|
||||
output_chunks.append(
|
||||
"[binary output detected — raw bytes not displayable]"
|
||||
)
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
fd = proc.stdout.fileno()
|
||||
idle_after_exit = 0
|
||||
while True:
|
||||
try:
|
||||
ready, _, _ = select.select([fd], [], [], 0.1)
|
||||
except (ValueError, OSError):
|
||||
break # fd already closed
|
||||
if ready:
|
||||
try:
|
||||
chunk = os.read(fd, 4096)
|
||||
except (ValueError, OSError):
|
||||
break
|
||||
if not chunk:
|
||||
break # true EOF — all writers closed
|
||||
try:
|
||||
output_chunks.append(chunk.decode("utf-8"))
|
||||
except UnicodeDecodeError:
|
||||
output_chunks.clear()
|
||||
output_chunks.append(
|
||||
"[binary output detected — raw bytes not displayable]"
|
||||
)
|
||||
break
|
||||
idle_after_exit = 0
|
||||
elif proc.poll() is not None:
|
||||
# bash is gone and the pipe was idle for ~100ms. Give
|
||||
# it two more cycles to catch any buffered tail, then
|
||||
# stop — otherwise we wait forever on a grandchild pipe.
|
||||
idle_after_exit += 1
|
||||
if idle_after_exit >= 3:
|
||||
break
|
||||
|
||||
drain_thread = threading.Thread(target=_drain, daemon=True)
|
||||
drain_thread.start()
|
||||
@@ -553,7 +590,10 @@ class BaseEnvironment(ABC):
|
||||
pass # cleanup is best-effort
|
||||
raise
|
||||
|
||||
drain_thread.join(timeout=5)
|
||||
# Drain thread now exits promptly after bash does (~300ms idle
|
||||
# check). A short join is enough; a long one would be a bug since
|
||||
# it means the non-blocking loop itself stopped cooperating.
|
||||
drain_thread.join(timeout=2)
|
||||
|
||||
try:
|
||||
proc.stdout.close()
|
||||
|
||||
Reference in New Issue
Block a user