perf: fix O(n²) catastrophic backtracking in redact regex + reorder file read guard

Two pre-existing issues causing test_file_read_guards timeouts on CI: 1. agent/redact.py: _ENV_ASSIGN_RE used unbounded [A-Z_]* with IGNORECASE, matching any letter/underscore to end-of-string at each position → O(n²) backtracking on 100K+ char inputs. Bounded to {0,50} since env var names are never that long. 2. tools/file_tools.py: redact_sensitive_text() ran BEFORE the character-count guard, so oversized content (that would be rejected anyway) went through the expensive regex first. Reordered to check size limit before redaction.
2026-04-03 16:25:35 +02:00
parent 1c0c5d957f
commit 831067c5d3
2 changed files with 7 additions and 3 deletions
--- a/agent/redact.py
+++ b/agent/redact.py
@@ -53,7 +53,7 @@ _PREFIX_PATTERNS = [
 # ENV assignment patterns: KEY=value where KEY contains a secret-like name
 _SECRET_ENV_NAMES = r"(?:API_?KEY|TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIAL|AUTH)"
 _ENV_ASSIGN_RE = re.compile(
-    rf"([A-Z_]*{_SECRET_ENV_NAMES}[A-Z_]*)\s*=\s*(['\"]?)(\S+)\2",
+    rf"([A-Z_]{{0,50}}{_SECRET_ENV_NAMES}[A-Z_]{{0,50}})\s*=\s*(['\"]?)(\S+)\2",
    re.IGNORECASE,
 )

--- a/tools/file_tools.py
+++ b/tools/file_tools.py
@@ -345,8 +345,6 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
        # ── Perform the read ──────────────────────────────────────────
        file_ops = _get_file_ops(task_id)
        result = file_ops.read_file(path, offset, limit)
-        if result.content:
-            result.content = redact_sensitive_text(result.content)
        result_dict = result.to_dict()

        # ── Character-count guard ─────────────────────────────────────
@@ -355,6 +353,7 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
        # amount of content, reject it and tell the model to narrow down.
        # Note: we check the formatted content (with line-number prefixes),
        # not the raw file size, because that's what actually enters context.
+        # Check BEFORE redaction to avoid expensive regex on huge content.
        content_len = len(result.content or "")
        file_size = result_dict.get("file_size", 0)
        max_chars = _get_max_read_chars()
@@ -372,6 +371,11 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
                "file_size": file_size,
            }, ensure_ascii=False)

+        # ── Redact secrets (after guard check to skip oversized content) ──
+        if result.content:
+            result.content = redact_sensitive_text(result.content)
+            result_dict["content"] = result.content
+
        # Large-file hint: if the file is big and the caller didn't ask
        # for a narrow window, nudge toward targeted reads.
        if (file_size and file_size > _LARGE_FILE_HINT_BYTES