From cd7150a195f328c38050b4a0de5ed6491d7792ed Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 28 Apr 2026 18:44:14 -0700 Subject: [PATCH] perf(approval): precompile DANGEROUS_PATTERNS and HARDLINE_PATTERNS (#17206) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit detect_dangerous_command() and detect_hardline_command() were calling re.search(pattern, text, re.IGNORECASE | re.DOTALL) inline — Python's re._cache (512 patterns) amortizes compile cost on the warm path, but: 1. The first terminal() call per process pays the full compile fan-out for all 59 patterns (12 HARDLINE + 47 DANGEROUS). Measured at ~2.6 ms per detect_dangerous_command() call after re.purge(). 2. The re._cache is LRU — unrelated regex work elsewhere in the agent (response parsing, text normalization, etc.) can evict our patterns and silently re-compile them on the next terminal() call. Precompiling at module load eliminates both costs: detect_dangerous_command: cold 2.613 ms → 0.298 ms (-88%) warm 0.042 ms → 0.004 ms (-90%) detect_hardline_command: cold ~0.6 ms → 0.006 ms warm 0.011 ms → 0.002 ms Savings are per terminal() call. Agents with heavy terminal use see compound savings; the bigger value is the stability guarantee (no re._cache eviction can silently re-introduce the 2.6 ms cold cost mid-session). Implementation: - HARDLINE_PATTERNS_COMPILED and DANGEROUS_PATTERNS_COMPILED built at module load from the existing (pattern, description) tuples, using shared _RE_FLAGS = re.IGNORECASE | re.DOTALL. - detect_* functions now iterate the compiled list and call pattern_re.search(text). - Original HARDLINE_PATTERNS and DANGEROUS_PATTERNS lists kept as-is (other code in the file uses them for key derivation / _PATTERN_KEY_ALIASES). Verified: - 160/161 tests/tools/test_approval*.py pass (1 pre-existing heartbeat test flake on main). - 349/349 tests/tools/ 'approval or terminal or dangerous' pass. 
- Live hermes chat smoke: 3 benign terminal commands + 1 rm -rf /tmp/ (clarify prompt fired — approval path still works) + 1 sudo (sudo password prompt fired — DANGEROUS pattern match still works). 23 log lines in the smoke window, zero errors. Co-authored-by: teknium1 --- tools/approval.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/tools/approval.py b/tools/approval.py index aac0406d5..5521ab5bc 100644 --- a/tools/approval.py +++ b/tools/approval.py @@ -164,6 +164,18 @@ HARDLINE_PATTERNS = [ (_CMDPOS + r'telinit\s+[06]\b', "telinit 0/6 (shutdown/reboot)"), ] +# Pre-compiled variant used by the hot-path matcher. Building these at module +# load eliminates the ~2.6 ms cold-cache re.compile fan-out on the first +# terminal() call per process (12 HARDLINE + 47 DANGEROUS patterns, each +# potentially evicted from Python's 512-entry ``re._cache`` by unrelated +# regex work elsewhere in the agent). DANGEROUS_PATTERNS_COMPILED is built +# further down in this module, right after DANGEROUS_PATTERNS is defined. +_RE_FLAGS = re.IGNORECASE | re.DOTALL +HARDLINE_PATTERNS_COMPILED = [ + (re.compile(pattern, _RE_FLAGS), description) + for pattern, description in HARDLINE_PATTERNS +] + def detect_hardline_command(command: str) -> tuple: """Check if a command matches the unconditional hardline blocklist. @@ -172,8 +184,8 @@ def detect_hardline_command(command: str) -> tuple: (is_hardline, description) or (False, None) """ normalized = _normalize_command_for_detection(command).lower() - for pattern, description in HARDLINE_PATTERNS: - if re.search(pattern, normalized, re.IGNORECASE | re.DOTALL): + for pattern_re, description in HARDLINE_PATTERNS_COMPILED: + if pattern_re.search(normalized): return (True, description) return (False, None) @@ -267,6 +279,13 @@ DANGEROUS_PATTERNS = [ ] +# Pre-compiled variant (same rationale as HARDLINE_PATTERNS_COMPILED above). 
+DANGEROUS_PATTERNS_COMPILED = [ + (re.compile(pattern, _RE_FLAGS), description) + for pattern, description in DANGEROUS_PATTERNS +] + + def _legacy_pattern_key(pattern: str) -> str: """Reproduce the old regex-derived approval key for backwards compatibility.""" return pattern.split(r'\b')[1] if r'\b' in pattern else pattern[:20] @@ -319,8 +338,8 @@ def detect_dangerous_command(command: str) -> tuple: (is_dangerous, pattern_key, description) or (False, None, None) """ command_lower = _normalize_command_for_detection(command).lower() - for pattern, description in DANGEROUS_PATTERNS: - if re.search(pattern, command_lower, re.IGNORECASE | re.DOTALL): + for pattern_re, description in DANGEROUS_PATTERNS_COMPILED: + if pattern_re.search(command_lower): pattern_key = description return (True, pattern_key, description) return (False, None, None)