Merge branch 'main' of github.com:NousResearch/hermes-agent into bb/tui-audit-followup
This commit is contained in:
@@ -782,15 +782,6 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
|
||||
extra["default_headers"] = copilot_default_headers()
|
||||
elif "generativelanguage.googleapis.com" in base_url.lower():
|
||||
# Google's OpenAI-compatible endpoint only accepts x-goog-api-key.
|
||||
# Passing api_key= causes the SDK to inject Authorization: Bearer,
|
||||
# which Google rejects with HTTP 400 "Multiple authentication
|
||||
# credentials received". Use a placeholder for api_key and pass
|
||||
# the real key via x-goog-api-key header instead.
|
||||
# Fixes: https://github.com/NousResearch/hermes-agent/issues/7893
|
||||
extra["default_headers"] = {"x-goog-api-key": api_key}
|
||||
api_key = "not-used"
|
||||
return OpenAI(api_key=api_key, base_url=base_url, **extra), model
|
||||
|
||||
creds = resolve_api_key_provider_credentials(provider_id)
|
||||
@@ -812,15 +803,6 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
|
||||
extra["default_headers"] = copilot_default_headers()
|
||||
elif "generativelanguage.googleapis.com" in base_url.lower():
|
||||
# Google's OpenAI-compatible endpoint only accepts x-goog-api-key.
|
||||
# Passing api_key= causes the SDK to inject Authorization: Bearer,
|
||||
# which Google rejects with HTTP 400 "Multiple authentication
|
||||
# credentials received". Use a placeholder for api_key and pass
|
||||
# the real key via x-goog-api-key header instead.
|
||||
# Fixes: https://github.com/NousResearch/hermes-agent/issues/7893
|
||||
extra["default_headers"] = {"x-goog-api-key": api_key}
|
||||
api_key = "not-used"
|
||||
return OpenAI(api_key=api_key, base_url=base_url, **extra), model
|
||||
|
||||
return None, None
|
||||
@@ -1666,16 +1648,6 @@ def resolve_provider_client(
|
||||
from hermes_cli.models import copilot_default_headers
|
||||
|
||||
headers.update(copilot_default_headers())
|
||||
elif "generativelanguage.googleapis.com" in base_url.lower():
|
||||
# Google's OpenAI-compatible endpoint only accepts x-goog-api-key.
|
||||
# Passing api_key= causes the OpenAI SDK to inject Authorization: Bearer,
|
||||
# which Google rejects with HTTP 400 "Multiple authentication credentials
|
||||
# received". Use a placeholder for api_key and pass the real key via
|
||||
# x-goog-api-key header instead.
|
||||
# Fixes: https://github.com/NousResearch/hermes-agent/issues/7893
|
||||
headers["x-goog-api-key"] = api_key
|
||||
api_key = "not-used"
|
||||
|
||||
client = OpenAI(api_key=api_key, base_url=base_url,
|
||||
**({"default_headers": headers} if headers else {}))
|
||||
|
||||
|
||||
@@ -420,7 +420,10 @@ def list_provider_models(provider: str) -> List[str]:
|
||||
models = _get_provider_models(provider)
|
||||
if models is None:
|
||||
return []
|
||||
return list(models.keys())
|
||||
return [
|
||||
mid for mid in models.keys()
|
||||
if not _should_hide_from_provider_catalog(provider, mid)
|
||||
]
|
||||
|
||||
|
||||
# Patterns that indicate non-agentic or noise models (TTS, embedding,
|
||||
@@ -432,6 +435,43 @@ _NOISE_PATTERNS: re.Pattern = re.compile(
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# Google's live Gemini catalogs currently include a mix of stale slugs and
|
||||
# Gemma models whose TPM quotas are too small for normal Hermes agent traffic.
|
||||
# Keep capability metadata available for direct/manual use, but hide these from
|
||||
# the Gemini model catalogs we surface in setup and model selection.
|
||||
_GOOGLE_HIDDEN_MODELS = frozenset({
|
||||
# Low-TPM Gemma models that trip Google input-token quota walls under
|
||||
# agent-style traffic despite advertising large context windows.
|
||||
"gemma-4-31b-it",
|
||||
"gemma-4-26b-it",
|
||||
"gemma-4-26b-a4b-it",
|
||||
"gemma-3-1b",
|
||||
"gemma-3-1b-it",
|
||||
"gemma-3-2b",
|
||||
"gemma-3-2b-it",
|
||||
"gemma-3-4b",
|
||||
"gemma-3-4b-it",
|
||||
"gemma-3-12b",
|
||||
"gemma-3-12b-it",
|
||||
"gemma-3-27b",
|
||||
"gemma-3-27b-it",
|
||||
# Stale/retired Google slugs that still surface through models.dev-backed
|
||||
# Gemini selection but 404 on the current Google endpoints.
|
||||
"gemini-1.5-flash",
|
||||
"gemini-1.5-pro",
|
||||
"gemini-1.5-flash-8b",
|
||||
"gemini-2.0-flash",
|
||||
"gemini-2.0-flash-lite",
|
||||
})
|
||||
|
||||
|
||||
def _should_hide_from_provider_catalog(provider: str, model_id: str) -> bool:
|
||||
provider_lower = (provider or "").strip().lower()
|
||||
model_lower = (model_id or "").strip().lower()
|
||||
if provider_lower in {"gemini", "google"} and model_lower in _GOOGLE_HIDDEN_MODELS:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def list_agentic_models(provider: str) -> List[str]:
|
||||
"""Return model IDs suitable for agentic use from models.dev.
|
||||
@@ -448,6 +488,8 @@ def list_agentic_models(provider: str) -> List[str]:
|
||||
for mid, entry in models.items():
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
if _should_hide_from_provider_catalog(provider, mid):
|
||||
continue
|
||||
if not entry.get("tool_call", False):
|
||||
continue
|
||||
if _NOISE_PATTERNS.search(mid):
|
||||
@@ -582,5 +624,3 @@ def get_model_info(
|
||||
return _parse_model_info(mid, mdata, mdev_id)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@@ -133,8 +133,6 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
|
||||
"gemini-2.5-pro",
|
||||
"gemini-2.5-flash",
|
||||
"gemini-2.5-flash-lite",
|
||||
# Gemma open models (also served via AI Studio)
|
||||
"gemma-4-31b-it",
|
||||
],
|
||||
"google-gemini-cli": [
|
||||
"gemini-2.5-pro",
|
||||
|
||||
@@ -91,7 +91,6 @@ _DEFAULT_PROVIDER_MODELS = {
|
||||
"gemini": [
|
||||
"gemini-3.1-pro-preview", "gemini-3-flash-preview", "gemini-3.1-flash-lite-preview",
|
||||
"gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.5-flash-lite",
|
||||
"gemma-4-31b-it",
|
||||
],
|
||||
"zai": ["glm-5.1", "glm-5", "glm-4.7", "glm-4.5", "glm-4.5-flash"],
|
||||
"kimi-coding": ["kimi-k2.5", "kimi-k2-thinking", "kimi-k2-turbo-preview"],
|
||||
|
||||
51
run_agent.py
51
run_agent.py
@@ -1054,16 +1054,6 @@ class AIAgent:
|
||||
}
|
||||
elif "portal.qwen.ai" in effective_base.lower():
|
||||
client_kwargs["default_headers"] = _qwen_portal_headers()
|
||||
elif "generativelanguage.googleapis.com" in effective_base.lower():
|
||||
# Google's OpenAI-compatible endpoint only accepts x-goog-api-key.
|
||||
# The OpenAI SDK auto-injects Authorization: Bearer when api_key= is
|
||||
# set to a real value, causing HTTP 400 "Multiple authentication
|
||||
# credentials received". Pass a placeholder so the SDK does not
|
||||
# emit Bearer, and carry the real key via x-goog-api-key instead.
|
||||
# Fixes: https://github.com/NousResearch/hermes-agent/issues/7893
|
||||
real_key = client_kwargs["api_key"]
|
||||
client_kwargs["api_key"] = "not-used"
|
||||
client_kwargs["default_headers"] = {"x-goog-api-key": real_key}
|
||||
else:
|
||||
# No explicit creds — use the centralized provider router
|
||||
from agent.auxiliary_client import resolve_provider_client
|
||||
@@ -5245,17 +5235,6 @@ class AIAgent:
|
||||
self._client_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.30.0"}
|
||||
elif "portal.qwen.ai" in normalized:
|
||||
self._client_kwargs["default_headers"] = _qwen_portal_headers()
|
||||
elif "generativelanguage.googleapis.com" in normalized:
|
||||
# Google's endpoint rejects Bearer tokens; use x-goog-api-key instead.
|
||||
# Swap the real key out of api_key and into the header so the OpenAI
|
||||
# SDK does not emit Authorization: Bearer.
|
||||
# Fixes: https://github.com/NousResearch/hermes-agent/issues/7893
|
||||
real_key = self._client_kwargs.get("api_key", "")
|
||||
if real_key and real_key != "not-used":
|
||||
self._client_kwargs["api_key"] = "not-used"
|
||||
self._client_kwargs["default_headers"] = {
|
||||
"x-goog-api-key": real_key or self._client_kwargs.get("api_key", ""),
|
||||
}
|
||||
else:
|
||||
self._client_kwargs.pop("default_headers", None)
|
||||
|
||||
@@ -5868,7 +5847,15 @@ class AIAgent:
|
||||
entry["id"] = tc_delta.id
|
||||
if tc_delta.function:
|
||||
if tc_delta.function.name:
|
||||
entry["function"]["name"] += tc_delta.function.name
|
||||
# Use assignment, not +=. Function names are
|
||||
# atomic identifiers delivered complete in the
|
||||
# first chunk (OpenAI spec). Some providers
|
||||
# (MiniMax M2.7 via NVIDIA NIM) resend the full
|
||||
# name in every chunk; concatenation would
|
||||
# produce "read_fileread_file". Assignment
|
||||
# (matching the OpenAI Node SDK / LiteLLM /
|
||||
# Vercel AI patterns) is immune to this.
|
||||
entry["function"]["name"] = tc_delta.function.name
|
||||
if tc_delta.function.arguments:
|
||||
entry["function"]["arguments"] += tc_delta.function.arguments
|
||||
extra = getattr(tc_delta, "extra_content", None)
|
||||
@@ -7053,8 +7040,20 @@ class AIAgent:
|
||||
if self.tools:
|
||||
api_kwargs["tools"] = self.tools
|
||||
|
||||
if self.max_tokens is not None:
|
||||
# ── max_tokens for chat_completions ──────────────────────────────
|
||||
# Priority: ephemeral override (error recovery / length-continuation
|
||||
# boost) > user-configured max_tokens > provider-specific defaults.
|
||||
_ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None)
|
||||
if _ephemeral_out is not None:
|
||||
self._ephemeral_max_output_tokens = None # consume immediately
|
||||
api_kwargs.update(self._max_tokens_param(_ephemeral_out))
|
||||
elif self.max_tokens is not None:
|
||||
api_kwargs.update(self._max_tokens_param(self.max_tokens))
|
||||
elif "integrate.api.nvidia.com" in self._base_url_lower:
|
||||
# NVIDIA NIM defaults to a very low max_tokens when omitted,
|
||||
# causing models like GLM-4.7 to truncate immediately (thinking
|
||||
# tokens alone exhaust the budget). 16384 provides adequate room.
|
||||
api_kwargs.update(self._max_tokens_param(16384))
|
||||
elif self._is_qwen_portal():
|
||||
# Qwen Portal defaults to a very low max_tokens when omitted.
|
||||
# Reasoning models (qwen3-coder-plus) exhaust that budget on
|
||||
@@ -10796,6 +10795,12 @@ class AIAgent:
|
||||
continue
|
||||
|
||||
if restart_with_length_continuation:
|
||||
# Progressively boost the output token budget on each retry.
|
||||
# Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
|
||||
# Applies to all providers via _ephemeral_max_output_tokens.
|
||||
_boost_base = self.max_tokens if self.max_tokens else 4096
|
||||
_boost = _boost_base * (length_continue_retries + 1)
|
||||
self._ephemeral_max_output_tokens = min(_boost, 32768)
|
||||
continue
|
||||
|
||||
# Guard: if all retries exhausted without a successful response
|
||||
|
||||
@@ -266,6 +266,8 @@ AUTHOR_MAP = {
|
||||
"limkuan24@gmail.com": "WideLee",
|
||||
"aviralarora002@gmail.com": "AviArora02-commits",
|
||||
"junminliu@gmail.com": "JimLiu",
|
||||
"jarvischer@gmail.com": "maxchernin",
|
||||
"levantam.98.2324@gmail.com": "LVT382009",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -130,7 +130,7 @@ class TestGeminiModelCatalog:
|
||||
models = _PROVIDER_MODELS["gemini"]
|
||||
assert "gemini-2.5-pro" in models
|
||||
assert "gemini-2.5-flash" in models
|
||||
assert "gemma-4-31b-it" in models
|
||||
assert "gemma-4-31b-it" not in models
|
||||
|
||||
def test_provider_models_has_3x(self):
|
||||
models = _PROVIDER_MODELS["gemini"]
|
||||
@@ -207,14 +207,8 @@ class TestGeminiAgentInit:
|
||||
assert agent.api_mode == "chat_completions"
|
||||
assert agent.provider == "gemini"
|
||||
|
||||
def test_gemini_uses_x_goog_api_key_not_bearer(self, monkeypatch):
|
||||
"""Regression test for issue #7893.
|
||||
|
||||
When provider=gemini, the OpenAI client must be constructed with
|
||||
api_key='not-used' and default_headers={'x-goog-api-key': real_key}.
|
||||
This prevents the SDK from injecting Authorization: Bearer, which
|
||||
Google's endpoint rejects with HTTP 400.
|
||||
"""
|
||||
def test_gemini_uses_bearer_auth(self, monkeypatch):
|
||||
"""Gemini OpenAI-compatible endpoint should receive the real API key."""
|
||||
monkeypatch.setenv("GOOGLE_API_KEY", "AIzaSy_REAL_KEY")
|
||||
real_key = "AIzaSy_REAL_KEY"
|
||||
with patch("run_agent.OpenAI") as mock_openai:
|
||||
@@ -227,37 +221,22 @@ class TestGeminiAgentInit:
|
||||
base_url="https://generativelanguage.googleapis.com/v1beta/openai",
|
||||
)
|
||||
call_kwargs = mock_openai.call_args[1]
|
||||
# The SDK must NOT receive the real key as api_key (which would emit Bearer)
|
||||
assert call_kwargs.get("api_key") == "not-used", (
|
||||
"api_key must be 'not-used' to suppress Authorization: Bearer for Gemini"
|
||||
)
|
||||
# The real key must be in x-goog-api-key header
|
||||
assert call_kwargs.get("api_key") == real_key
|
||||
headers = call_kwargs.get("default_headers", {})
|
||||
assert headers.get("x-goog-api-key") == real_key, (
|
||||
"x-goog-api-key header must carry the real Gemini API key"
|
||||
)
|
||||
assert "x-goog-api-key" not in headers
|
||||
|
||||
def test_gemini_resolve_provider_client_auth(self, monkeypatch):
|
||||
"""Regression test for issue #7893 — resolve_provider_client path.
|
||||
|
||||
When resolve_provider_client('gemini') is called, the returned OpenAI
|
||||
client must use x-goog-api-key header, not Authorization: Bearer.
|
||||
"""
|
||||
"""resolve_provider_client('gemini') should pass the real API key through."""
|
||||
monkeypatch.setenv("GEMINI_API_KEY", "AIzaSy_TEST_KEY")
|
||||
real_key = "AIzaSy_TEST_KEY"
|
||||
with patch("agent.auxiliary_client.OpenAI") as mock_openai:
|
||||
mock_openai.return_value = MagicMock()
|
||||
mock_openai.return_value.api_key = "not-used"
|
||||
from agent.auxiliary_client import resolve_provider_client
|
||||
resolve_provider_client("gemini")
|
||||
call_kwargs = mock_openai.call_args[1]
|
||||
assert call_kwargs.get("api_key") == "not-used", (
|
||||
"api_key must be 'not-used' to prevent Bearer injection for Gemini"
|
||||
)
|
||||
assert call_kwargs.get("api_key") == real_key
|
||||
headers = call_kwargs.get("default_headers", {})
|
||||
assert headers.get("x-goog-api-key") == real_key, (
|
||||
"x-goog-api-key header must carry the real Gemini API key"
|
||||
)
|
||||
assert "x-goog-api-key" not in headers
|
||||
|
||||
|
||||
# ── models.dev Integration ──
|
||||
@@ -313,9 +292,32 @@ class TestGeminiModelsDev:
|
||||
result = list_agentic_models("gemini")
|
||||
assert "gemini-3-flash-preview" in result
|
||||
assert "gemini-2.5-pro" in result
|
||||
assert "gemma-4-31b-it" in result
|
||||
assert "gemma-4-31b-it" not in result
|
||||
# Filtered out:
|
||||
assert "gemini-embedding-001" not in result # no tool_call
|
||||
assert "gemini-2.5-flash-preview-tts" not in result # no tool_call
|
||||
assert "gemini-live-2.5-flash" not in result # noise: live-
|
||||
assert "gemini-2.5-flash-preview-04-17" not in result # noise: dated preview
|
||||
|
||||
def test_list_provider_models_hides_low_tpm_google_gemmas(self):
    """list_provider_models("gemini") keeps current models and drops hidden ones.

    The mocked models.dev payload mixes one current Gemini model with
    low-TPM Gemma models and retired Gemini slugs; only the current model
    may appear in the result.
    """
    kept_id = "gemini-2.5-pro"
    hidden_ids = (
        "gemma-4-31b-it",
        "gemma-3-27b-it",
        "gemini-1.5-pro",
        "gemini-2.0-flash",
    )
    mock_data = {
        "google": {
            "models": {mid: {} for mid in (kept_id, *hidden_ids)},
        }
    }

    with patch("agent.models_dev.fetch_models_dev", return_value=mock_data):
        from agent.models_dev import list_provider_models

        result = list_provider_models("gemini")

    assert kept_id in result
    for hidden_id in hidden_ids:
        assert hidden_id not in result
|
||||
|
||||
@@ -141,6 +141,50 @@ class TestStreamingAccumulator:
|
||||
assert tc[0].function.name == "terminal"
|
||||
assert tc[0].function.arguments == '{"command": "ls"}'
|
||||
|
||||
@patch("run_agent.AIAgent._create_request_openai_client")
@patch("run_agent.AIAgent._close_request_openai_client")
def test_tool_name_not_duplicated_when_resent_per_chunk(self, mock_close, mock_create):
    """A tool name repeated in every stream chunk must not be concatenated.

    MiniMax M2.7 via NVIDIA NIM resends the full function name in each
    chunk. Bug #8259: accumulating the name with += yielded
    "read_fileread_file". Assigning the name instead (as the OpenAI Node
    SDK and LiteLLM do) prevents this.
    """
    from run_agent import AIAgent

    # Every delta carries the complete name; only the arguments arrive
    # incrementally (the first delta has no arguments at all).
    per_chunk_extras = (
        {},
        {"arguments": '{"path":'},
        {"arguments": ' "x.py"}'},
    )
    chunks = [
        _make_stream_chunk(tool_calls=[
            _make_tool_call_delta(index=0, tc_id="call_nim", name="read_file", **extras)
        ])
        for extras in per_chunk_extras
    ]
    chunks.append(_make_stream_chunk(finish_reason="tool_calls"))

    streaming_client = MagicMock()
    streaming_client.chat.completions.create.return_value = iter(chunks)
    mock_create.return_value = streaming_client

    agent = AIAgent(
        model="test/model",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
    )
    agent.api_mode = "chat_completions"
    agent._interrupt_requested = False

    response = agent._interruptible_streaming_api_call({})

    tool_calls = response.choices[0].message.tool_calls
    assert tool_calls is not None
    assert len(tool_calls) == 1
    only_call = tool_calls[0]
    assert only_call.function.name == "read_file"
    assert only_call.function.arguments == '{"path": "x.py"}'
|
||||
|
||||
@patch("run_agent.AIAgent._create_request_openai_client")
|
||||
@patch("run_agent.AIAgent._close_request_openai_client")
|
||||
def test_tool_call_extra_content_preserved(self, mock_close, mock_create):
|
||||
|
||||
Reference in New Issue
Block a user