fix(model_tools): cancel coroutine on timeout so worker thread exits + log full traceback
_run_async() bridges sync tool handlers to async code. When the handler
is invoked from inside a running event loop (gateway / nested async),
it spawns a worker thread and blocks on future.result(timeout=300).
Before this change, a coroutine that ran past 300s leaked its worker
thread:
- future.cancel() is a no-op on a running ThreadPoolExecutor future
  (cancel only works on not-yet-started work).
- pool.shutdown(wait=False, cancel_futures=True) let the caller
  proceed, but the worker kept running the coroutine until it
  returned on its own.
Every tool timeout leaked one thread. In long-lived gateway / RL
sessions this is cumulative.
The fix replaces bare asyncio.run() with a worker wrapper that
creates its own event loop. On timeout, _run_async schedules
task.cancel() on that loop via call_soon_threadsafe, then shuts the
pool down with wait=False so the caller is not blocked and the
TimeoutError propagates immediately. The coroutine observes
CancelledError at its next await and the worker thread then exits
cleanly. (A coroutine stuck in a blocking, non-awaiting call only
sees the cancellation once it reaches an await again.)
Also switches logger.error() to logger.exception() in the top-level
handle_function_call() except block so tool failures produce full
stack traces in errors.log instead of just the message.
Related: #17420 (contributor flagged the leak; the original fix used
pool.shutdown(wait=True) which would have converted the leak into a
hang — caller blocks forever on the same stuck coroutine). Credit
for identifying the leak goes to the contributor.
Co-authored-by: 0z! <162235745+0z1-ghb@users.noreply.github.com>
This commit is contained in:
@@ -107,17 +107,58 @@ def _run_async(coro):
|
||||
loop = None
|
||||
|
||||
if loop and loop.is_running():
|
||||
# Inside an async context (gateway, RL env) — run in a fresh thread.
|
||||
# Inside an async context (gateway, RL env) — run in a fresh thread
|
||||
# with its own event loop we own a reference to, so on timeout we
|
||||
# can cancel the task inside that loop (ThreadPoolExecutor.cancel()
|
||||
# only works on not-yet-started futures — it's a no-op on a running
|
||||
# worker, which previously leaked the thread on every 300 s timeout).
|
||||
import concurrent.futures
|
||||
|
||||
worker_loop: Optional[asyncio.AbstractEventLoop] = None
|
||||
loop_ready = threading.Event()
|
||||
|
||||
def _run_in_worker():
|
||||
nonlocal worker_loop
|
||||
worker_loop = asyncio.new_event_loop()
|
||||
loop_ready.set()
|
||||
try:
|
||||
asyncio.set_event_loop(worker_loop)
|
||||
return worker_loop.run_until_complete(coro)
|
||||
finally:
|
||||
try:
|
||||
# Cancel anything still pending (e.g. task cancelled
|
||||
# externally via call_soon_threadsafe on timeout).
|
||||
pending = asyncio.all_tasks(worker_loop)
|
||||
for t in pending:
|
||||
t.cancel()
|
||||
if pending:
|
||||
worker_loop.run_until_complete(
|
||||
asyncio.gather(*pending, return_exceptions=True)
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
worker_loop.close()
|
||||
|
||||
pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
|
||||
future = pool.submit(asyncio.run, coro)
|
||||
future = pool.submit(_run_in_worker)
|
||||
try:
|
||||
return future.result(timeout=300)
|
||||
except concurrent.futures.TimeoutError:
|
||||
future.cancel()
|
||||
# Cancel the coroutine inside its own loop so the worker thread
|
||||
# can wind down instead of running forever.
|
||||
if loop_ready.wait(timeout=1.0) and worker_loop is not None:
|
||||
try:
|
||||
for t in asyncio.all_tasks(worker_loop):
|
||||
worker_loop.call_soon_threadsafe(t.cancel)
|
||||
except RuntimeError:
|
||||
# Loop already closed — nothing to cancel.
|
||||
pass
|
||||
raise
|
||||
finally:
|
||||
pool.shutdown(wait=False, cancel_futures=True)
|
||||
# wait=False: don't block the caller on a stuck coroutine. We've
|
||||
# already requested cancellation above; the worker will exit
|
||||
# once the coroutine observes it (usually at the next await).
|
||||
pool.shutdown(wait=False)
|
||||
|
||||
# If we're on a worker thread (e.g., parallel tool execution in
|
||||
# delegate_task), use a per-thread persistent loop. This avoids
|
||||
@@ -737,7 +778,7 @@ def handle_function_call(
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error executing {function_name}: {str(e)}"
|
||||
logger.error(error_msg)
|
||||
logger.exception(error_msg)
|
||||
return json.dumps({"error": error_msg}, ensure_ascii=False)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user