From cd7150a195f328c38050b4a0de5ed6491d7792ed Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Tue, 28 Apr 2026 18:44:14 -0700
Subject: [PATCH] perf(approval): precompile DANGEROUS_PATTERNS and
 HARDLINE_PATTERNS (#17206)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

detect_dangerous_command() and detect_hardline_command() were calling
re.search(pattern, text, re.IGNORECASE | re.DOTALL) inline — Python's
re._cache (512 patterns) amortizes compile cost on the warm path, but:

  1. The first terminal() call per process pays the full compile fan-out
     for all 59 patterns (12 HARDLINE + 47 DANGEROUS). Measured at
     ~2.6 ms per detect_dangerous_command() call after re.purge().
  2. The re._cache is LRU — unrelated regex work elsewhere in the agent
     (response parsing, text normalization, etc.) can evict our patterns
     and silently re-compile them on the next terminal() call.

Precompiling at module load moves the one-time compile cost to import time
and takes both costs off the terminal() path:

  detect_dangerous_command:
    cold  2.613 ms  →  0.298 ms   (-88%)
    warm  0.042 ms  →  0.004 ms   (-90%)
  detect_hardline_command:
    cold  ~0.6 ms   →  0.006 ms
    warm  0.011 ms  →  0.002 ms

Savings are per terminal() call. Agents with heavy terminal use see
compound savings; the bigger value is the stability guarantee (no
re._cache eviction can silently re-introduce the 2.6 ms cold cost
mid-session).

Implementation:
- HARDLINE_PATTERNS_COMPILED and DANGEROUS_PATTERNS_COMPILED built at
  module load from the existing (pattern, description) tuples, using
  shared _RE_FLAGS = re.IGNORECASE | re.DOTALL.
- detect_* functions now iterate the compiled list and call pattern_re.search(text).
- Original HARDLINE_PATTERNS and DANGEROUS_PATTERNS lists kept as-is
  (other code in the file uses them for key derivation /
  _PATTERN_KEY_ALIASES).

Verified:
- 160/161 tests/tools/test_approval*.py pass (1 pre-existing heartbeat
  test flake on main).
- 349/349 tests/tools/ 'approval or terminal or dangerous' pass.
- Live hermes chat smoke: 3 benign terminal commands + 1 rm -rf /tmp/
  (clarify prompt fired — approval path still works) + 1 sudo (sudo
  password prompt fired — DANGEROUS pattern match still works). 23
  log lines in the smoke window, zero errors.

Co-authored-by: teknium1 <teknium@users.noreply.github.com>
---
 tools/approval.py | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/tools/approval.py b/tools/approval.py
index aac0406d5..5521ab5bc 100644
--- a/tools/approval.py
+++ b/tools/approval.py
@@ -164,6 +164,18 @@ HARDLINE_PATTERNS = [
     (_CMDPOS + r'telinit\s+[06]\b', "telinit 0/6 (shutdown/reboot)"),
 ]
 
+# Pre-compiled variant used by the hot-path matcher. Building these at module
+# load eliminates the ~2.6 ms cold-cache re.compile fan-out on the first
+# terminal() call per process (12 HARDLINE + 47 DANGEROUS patterns, each
+# potentially evicted from Python's 512-entry ``re._cache`` by unrelated
+# regex work elsewhere in the agent). DANGEROUS_PATTERNS_COMPILED is built
+# further down, immediately after DANGEROUS_PATTERNS is defined.
+_RE_FLAGS = re.IGNORECASE | re.DOTALL
+HARDLINE_PATTERNS_COMPILED = [
+    (re.compile(pattern, _RE_FLAGS), description)
+    for pattern, description in HARDLINE_PATTERNS
+]
+
 
 def detect_hardline_command(command: str) -> tuple:
     """Check if a command matches the unconditional hardline blocklist.
@@ -172,8 +184,8 @@ def detect_hardline_command(command: str) -> tuple:
         (is_hardline, description) or (False, None)
     """
     normalized = _normalize_command_for_detection(command).lower()
-    for pattern, description in HARDLINE_PATTERNS:
-        if re.search(pattern, normalized, re.IGNORECASE | re.DOTALL):
+    for pattern_re, description in HARDLINE_PATTERNS_COMPILED:
+        if pattern_re.search(normalized):
             return (True, description)
     return (False, None)
 
@@ -267,6 +279,13 @@ DANGEROUS_PATTERNS = [
 ]
 
 
+# Pre-compiled variant (same rationale as HARDLINE_PATTERNS_COMPILED above).
+DANGEROUS_PATTERNS_COMPILED = [
+    (re.compile(pattern, _RE_FLAGS), description)
+    for pattern, description in DANGEROUS_PATTERNS
+]
+
+
 def _legacy_pattern_key(pattern: str) -> str:
     """Reproduce the old regex-derived approval key for backwards compatibility."""
     return pattern.split(r'\b')[1] if r'\b' in pattern else pattern[:20]
@@ -319,8 +338,8 @@ def detect_dangerous_command(command: str) -> tuple:
         (is_dangerous, pattern_key, description) or (False, None, None)
     """
     command_lower = _normalize_command_for_detection(command).lower()
-    for pattern, description in DANGEROUS_PATTERNS:
-        if re.search(pattern, command_lower, re.IGNORECASE | re.DOTALL):
+    for pattern_re, description in DANGEROUS_PATTERNS_COMPILED:
+        if pattern_re.search(command_lower):
             pattern_key = description
             return (True, pattern_key, description)
     return (False, None, None)