From fa9383d27ba4357d75eb21f603220f8fa9b58106 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Tue, 28 Apr 2026 22:07:02 -0700 Subject: [PATCH] feat(curator): umbrella-first prompt, inherit parent config, unbounded iterations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on three live test runs against 346 agent-created skills on the author's own setup (~6.5 min, opus-4.7, 86 API calls), the curator prompt needed three sharpenings before it consistently produced real umbrella consolidation instead of passive audit output: **Umbrella-first framing.** The original 'decide keep/patch/archive/ consolidate' framing lets opus default to 'keep' whenever two skills aren't byte-identical. The new prompt explicitly tells the reviewer that pairwise distinctness is the wrong bar — the right question is 'would a human maintainer write this as N separate skills, or one skill with N labeled subsections?' Expect 10-25 prefix clusters; merge each into an umbrella via one of three methods. **Three concrete consolidation methods.** (a) Merge into an existing umbrella (patch the broadest skill, archive siblings); (b) Create a new umbrella SKILL.md (skill_manage action=create); (c) Demote session-specific detail into references/, templates/, or scripts/ under the umbrella via skill_manage action=write_file, then archive the narrow sibling. This matches the support-file vocabulary the review-prompt side already uses (PR #17213). **Two observed bailouts pre-empted:** 'usage counters are zero so I can't judge' (rule 4: judge on content, not use_count) and 'each has a distinct trigger' (rule 5: pairwise distinctness is the wrong bar). **Config-aware parent inheritance.** _run_llm_review() was building AIAgent() without explicit provider/model, hitting an auto-resolve path that returned empty credentials → HTTP 400 'No models provided' against OpenRouter. Fork now inherits the user's main provider and model (via load_config + resolve_runtime_provider) before spawning — runs on whatever the user is currently on, OAuth-backed or pool-backed included. **Unbounded iteration ceiling.** max_iterations=8 was way too low for an umbrella-build pass over hundreds of skills. A live pass takes 50-100 API calls (scanning, clustering, skill_view'ing candidates, patching umbrellas, mv'ing siblings). Raised to 9999 — the natural stopping criterion is 'no more clusters worth processing', not an arbitrary tool-call budget. **Tests updated:** test_curator_review_prompt_has_invariants accepts DO NOT / MUST NOT and drops 'keep' from the required-verb set (the umbrella-first prompt correctly deemphasizes 'keep' as a first-class decision label since passive keep-everything is the failure mode being prevented). Added test_curator_review_prompt_is_umbrella_first asserting the umbrella framing, class-level thinking, references/ + templates/ + scripts/ support-file mentions, and the 'use_count is not evidence of value' pre-emption. Added test_curator_review_prompt_offers_support_file_actions asserting skill_manage action=create and action=write_file are both named. **Live validation on author's setup:** - Run 1 (old prompt): 3 archives, stopped after surveying — typical passive outcome - Run 2 (consolidation prompt): 44 archives, 3 patches, surfaced the 50-skill mlops reorg duplicate bug but didn't umbrella - Run 3 (this prompt): 249 archives + 18 new class-level umbrellas created, reducing agent-created skills from 346 → 118 with every archived skill's content preserved as references/ under its umbrella. Pinned skill untouched. Full report in PR description. --- agent/curator.py | 183 ++++++++++++++++++++++++++++-------- tests/agent/test_curator.py | 50 +++++++++- 2 files changed, 191 insertions(+), 42 deletions(-) diff --git a/agent/curator.py b/agent/curator.py index 3bd71d46c..6858830aa 100644 --- a/agent/curator.py +++ b/agent/curator.py @@ -259,36 +259,98 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int # --------------------------------------------------------------------------- CURATOR_REVIEW_PROMPT = ( - "You are running as Hermes' background skill CURATOR.\n\n" - "Your job is to maintain the collection of AGENT-CREATED skills. Review " - "each candidate below and decide what to do.\n\n" - "Rules — all load-bearing, do not violate:\n" - "1. You MUST NOT touch bundled or hub-installed skills. The candidate list " + "You are running as Hermes' background skill CURATOR. This is an " + "UMBRELLA-BUILDING consolidation pass, not a passive audit and not a " + "duplicate-finder.\n\n" + "The goal of the skill collection is a LIBRARY OF CLASS-LEVEL " + "INSTRUCTIONS AND EXPERIENTIAL KNOWLEDGE. A collection of hundreds of " + "narrow skills where each one captures one session's specific bug is " + "a FAILURE of the library — not a feature. An agent searching skills " + "matches on descriptions, not on exact names; one broad umbrella " + "skill with labeled subsections beats five narrow siblings for " + "discoverability, not the other way around.\n\n" + "The right target shape is CLASS-LEVEL skills with rich SKILL.md " + "bodies + `references/`, `templates/`, and `scripts/` subfiles for " + "session-specific detail — not one-session-one-skill micro-entries.\n\n" + "Hard rules — do not violate:\n" + "1. DO NOT touch bundled or hub-installed skills. The candidate list " "below is already filtered to agent-created skills only.\n" - "2. You MUST NOT delete any skill. Archiving (moving the skill's directory " - "into ~/.hermes/skills/.archive/) is the maximum action. Archives are " - "recoverable; deletion is not.\n" - "3. You MUST NOT touch skills shown as pinned=yes. Skip them.\n" - "4. Prefer GENERALIZING overlapping skills by patching the stronger one " - "and archiving the weaker, rather than leaving two narrow skills in the " - "collection.\n\n" + "2. DO NOT delete any skill. Archiving (moving the skill's directory " + "into ~/.hermes/skills/.archive/) is the maximum destructive action. " + "Archives are recoverable; deletion is not.\n" + "3. DO NOT touch skills shown as pinned=yes. Skip them entirely.\n" + "4. DO NOT use usage counters as a reason to skip consolidation. The " + "counters are new and often mostly zero. Judge overlap on CONTENT, " + "not on use_count. 'use=0' is not evidence a skill is valuable; it's " + "absence of evidence either way.\n" + "5. DO NOT reject consolidation on the grounds that 'each skill has " + "a distinct trigger'. Pairwise distinctness is the wrong bar. The " + "right bar is: 'would a human maintainer write this as N separate " + "skills, or as one skill with N labeled subsections?' When the " + "answer is the latter, merge.\n\n" + "How to work — not optional:\n" + "1. Scan the full candidate list. Identify PREFIX CLUSTERS (skills " + "sharing a first word or domain keyword). Examples you are likely " + "to find: hermes-config-*, hermes-dashboard-*, gateway-*, codex-*, " + "ollama-*, anthropic-*, gemini-*, mcp-*, salvage-*, pr-*, " + "competitor-*, python-*, security-*, etc. Expect 10-25 clusters.\n" + "2. For each cluster with 2+ members, do NOT ask 'are these pairs " + "overlapping?' — ask 'what is the UMBRELLA CLASS these skills all " + "serve? Would a maintainer name that class and write one skill for " + "it?' If yes, pick (or create) the umbrella and absorb the siblings " + "into it.\n" + "3. Three ways to consolidate — use the right one per cluster:\n" + " a. MERGE INTO EXISTING UMBRELLA — one skill in the cluster is " + "already broad enough to be the umbrella (example: `pr-triage-" + "salvage` for the PR review cluster). Patch it to add a labeled " + "section for each sibling's unique insight, then archive the " + "siblings.\n" + " b. CREATE A NEW UMBRELLA SKILL.md — no existing member is broad " + "enough. Use skill_manage action=create to write a new class-level " + "skill whose SKILL.md covers the shared workflow and has short " + "labeled subsections. Archive the now-absorbed narrow siblings.\n" + " c. DEMOTE TO REFERENCES/TEMPLATES/SCRIPTS — a sibling has " + "narrow-but-valuable session-specific content. Move it into the " + "umbrella's appropriate support directory:\n" + " • `references/.md` for session-specific detail OR " + "condensed knowledge banks (quoted research, API docs excerpts, " + "domain notes, provider quirks, reproduction recipes)\n" + " • `templates/.` for starter files meant to be " + "copied and modified\n" + " • `scripts/.` for statically re-runnable actions " + "(verification scripts, fixture generators, probes)\n" + " Then archive the old sibling. Use `terminal` with `mkdir -p " + "~/.hermes/skills//references/ && mv ... /" + "references/.md` (or templates/ / scripts/).\n" + "4. Also flag skills whose NAME is too narrow (contains a PR number, " + "a feature codename, a specific error string, an 'audit' / " + "'diagnosis' / 'salvage' session artifact). These almost always " + "belong as a subsection or support file under a class-level umbrella.\n" + "5. Iterate. After one consolidation round, scan the remaining set " + "and look for the NEXT umbrella opportunity. Don't stop after 3 " + "merges.\n\n" "Your toolset:\n" - " - skills_list, skill_view — read the current landscape\n" - " - skill_manage action=patch — fix stale commands, wrong paths, or " - "merge two overlapping skills by broadening the stronger one\n" - " - terminal — move a skill directory into the archive, " - "e.g. mv ~/.hermes/skills/ ~/.hermes/skills/.archive/\n\n" - "For each candidate, decide one of:\n" - " keep — leave as-is (most common default; don't over-curate)\n" - " patch — skill_manage action=patch to fix stale commands, wrong " - "paths, or env-specific claims that are no longer true\n" - " consolidate — two skills overlap: patch the stronger one to absorb " - "the weaker (skill_manage), then mv the weaker directory to .archive/\n" - " archive — the skill is genuinely obsolete and has not been used " - "recently: mv its directory to ~/.hermes/skills/.archive/\n\n" - "Start by calling skills_list and skill_view on anything you consider " - "patching or consolidating. Be conservative — if in doubt, keep. " - "When you are done, write a one-sentence summary of what you changed." + " - skills_list, skill_view — read the current landscape\n" + " - skill_manage action=patch — add sections to the umbrella\n" + " - skill_manage action=create — create a new umbrella SKILL.md\n" + " - skill_manage action=write_file — add a references/, templates/, " + "or scripts/ file under an existing skill (the skill must already " + "exist)\n" + " - terminal — mv a sibling into the archive " + "OR move its content into a support subfile\n\n" + "'keep' is a legitimate decision ONLY when the skill is already a " + "class-level umbrella and none of the proposed merges would improve " + "discoverability. 'This is narrow but distinct from its siblings' " + "is NOT a reason to keep — it's a reason to move it under an " + "umbrella as a subsection or support file.\n\n" + "Expected output: real umbrella-ification. Process every obvious " + "cluster. If you end the pass with fewer than 10 archives, you " + "stopped too early — go back and look at the clusters you left " + "alone.\n\n" + "When done, write a summary with: clusters processed, skills " + "patched/absorbed, skills demoted to references/templates/scripts, " + "skills archived, new umbrellas created, and clusters you " + "deliberately left alone with one line each." ) @@ -399,22 +461,65 @@ def _run_llm_review(prompt: str) -> str: except Exception as e: return f"AIAgent import failed: {e}" + # Resolve provider + model the same way the CLI does, so the curator + # fork inherits the user's active main config rather than falling + # through to an empty provider/model pair (which sends HTTP 400 + # "No models provided"). AIAgent() without explicit provider/model + # arguments hits an auto-resolution path that fails for OAuth-only + # providers and for pool-backed credentials. + _api_key = None + _base_url = None + _api_mode = None + _resolved_provider = None + _model_name = "" + try: + from hermes_cli.config import load_config + from hermes_cli.runtime_provider import resolve_runtime_provider + _cfg = load_config() + _m = _cfg.get("model", {}) if isinstance(_cfg.get("model"), dict) else {} + _provider = _m.get("provider") or "auto" + _model_name = _m.get("default") or _m.get("model") or "" + _rp = resolve_runtime_provider( + requested=_provider, target_model=_model_name + ) + _api_key = _rp.get("api_key") + _base_url = _rp.get("base_url") + _api_mode = _rp.get("api_mode") + _resolved_provider = _rp.get("provider") or _provider + except Exception as e: + logger.debug("Curator provider resolution failed: %s", e, exc_info=True) + review_agent = None try: + review_agent = AIAgent( + model=_model_name, + provider=_resolved_provider, + api_key=_api_key, + base_url=_base_url, + api_mode=_api_mode, + # Umbrella-building over a large skill collection is worth a + # high iteration ceiling — the pass typically takes 50-100 + # API calls against hundreds of candidate skills. The + # single-session review path caps itself at a much smaller + # number because it's not doing a curation sweep. + max_iterations=9999, + quiet_mode=True, + platform="curator", + skip_context_files=True, + skip_memory=True, + ) + # Disable recursive nudges — the curator must never spawn its own review. + review_agent._memory_nudge_interval = 0 + review_agent._skill_nudge_interval = 0 + + # Redirect the forked agent's stdout/stderr to /dev/null while it + # runs so its tool-call chatter doesn't pollute the foreground + # terminal. The background-thread runner also hides it; this + # belt-and-suspenders path matters when a caller invokes + # run_curator_review(synchronous=True) from the CLI. with open(os.devnull, "w") as _devnull, \ contextlib.redirect_stdout(_devnull), \ contextlib.redirect_stderr(_devnull): - review_agent = AIAgent( - max_iterations=8, - quiet_mode=True, - platform="curator", - skip_context_files=True, - skip_memory=True, - ) - # Disable recursive nudges — the curator must never spawn its own review. - review_agent._memory_nudge_interval = 0 - review_agent._skill_nudge_interval = 0 - result = review_agent.run_conversation(user_message=prompt) final = "" diff --git a/tests/agent/test_curator.py b/tests/agent/test_curator.py index f5449d08e..a8a4b5ada 100644 --- a/tests/agent/test_curator.py +++ b/tests/agent/test_curator.py @@ -359,13 +359,19 @@ def test_state_atomic_write_no_tmp_leftovers(curator_env): def test_curator_review_prompt_has_invariants(): """Core invariants must be in the review prompt text.""" from agent.curator import CURATOR_REVIEW_PROMPT - assert "MUST NOT" in CURATOR_REVIEW_PROMPT + assert "MUST NOT" in CURATOR_REVIEW_PROMPT or "DO NOT" in CURATOR_REVIEW_PROMPT assert "bundled" in CURATOR_REVIEW_PROMPT.lower() assert "delete" in CURATOR_REVIEW_PROMPT.lower() assert "pinned" in CURATOR_REVIEW_PROMPT.lower() - # Must mention the decisions the reviewer can make - for verb in ("keep", "patch", "archive", "consolidate"): + # Must describe the actions the reviewer can take. The exact vocabulary + # has tightened over time (the umbrella-first prompt drops 'keep' as a + # first-class decision verb, since passive keep-everything is the + # failure mode the prompt is trying to avoid), but the core merge / + # archive / patch trio must remain callable. + for verb in ("patch", "archive"): assert verb in CURATOR_REVIEW_PROMPT.lower() + # Must mention consolidation (possibly via "merge" or "consolidat") + assert "consolidat" in CURATOR_REVIEW_PROMPT.lower() or "merge" in CURATOR_REVIEW_PROMPT.lower() def test_curator_review_prompt_points_at_existing_tools_only(): @@ -400,6 +406,44 @@ def test_curator_does_not_instruct_model_to_pin(): ) +def test_curator_review_prompt_is_umbrella_first(): + """The curator prompt must push umbrella-building / class-level thinking, + not pair-level 'are these two the same?' analysis.""" + from agent.curator import CURATOR_REVIEW_PROMPT + lower = CURATOR_REVIEW_PROMPT.lower() + # Must frame the task as active umbrella-building, not a passive audit. + assert "umbrella" in lower, ( + "must use UMBRELLA framing — the class-first abstraction the curator " + "is designed to produce" + ) + # Must tell the reviewer not to stop at pair-level distinctness. + assert "class" in lower, "must reference class-level thinking" + # Must cover the three consolidation methods explicitly + assert "references/" in CURATOR_REVIEW_PROMPT, ( + "must name references/ as a demotion target for session-specific content" + ) + # templates/ and scripts/ make the umbrella a real class-level skill + assert "templates/" in CURATOR_REVIEW_PROMPT + assert "scripts/" in CURATOR_REVIEW_PROMPT + # Must say the counter argument: usage=0 is not a reason to skip + assert "use_count" in CURATOR_REVIEW_PROMPT or "counter" in lower, ( + "must pre-empt the 'usage counters are zero, I can't judge' bailout" + ) + + +def test_curator_review_prompt_offers_support_file_actions(): + """Support-file demotion (references/templates/scripts) must be one of + the three consolidation methods, alongside merge-into-existing and + create-new-umbrella.""" + from agent.curator import CURATOR_REVIEW_PROMPT + # skill_manage action=write_file is how references/ are added to an + # existing skill — this is the create-adjacent action the curator needs + # to demote narrow siblings without touching their SKILL.md. + assert "write_file" in CURATOR_REVIEW_PROMPT + # Must offer creating a brand-new umbrella when no existing one fits + assert "action=create" in CURATOR_REVIEW_PROMPT or "create a new umbrella" in CURATOR_REVIEW_PROMPT.lower() + + def test_cli_unpin_refuses_bundled_skill(curator_env, capsys): """hermes curator unpin must refuse bundled/hub skills too (matches pin)."""