From 0f778f776877cd452cf7475f06a6044cf07ebfe8 Mon Sep 17 00:00:00 2001
From: jarvischer <jarvischer@gmail.com>
Date: Sat, 18 Apr 2026 22:46:36 +0530
Subject: [PATCH 1/5] fix: prevent tool name duplication in streaming
 accumulator (MiniMax/NVIDIA NIM)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based on #11984 by @maxchernin.  Fixes #8259.

Some providers (MiniMax M2.7 via NVIDIA NIM) resend the full function
name in every streaming chunk instead of only the first.  The old
accumulator used += which concatenated them into 'read_fileread_file'.

Changed to simple assignment (=), matching the OpenAI Node SDK, LiteLLM,
and Vercel AI SDK patterns.  Function names are atomic identifiers
delivered complete — no provider splits them across chunks, so
concatenation was never correct semantics.
---
 run_agent.py                      | 10 ++++++-
 scripts/release.py                |  1 +
 tests/run_agent/test_streaming.py | 44 +++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/run_agent.py b/run_agent.py
index a47455e53..e88096a60 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -5868,7 +5868,15 @@ class AIAgent:
                             entry["id"] = tc_delta.id
                         if tc_delta.function:
                             if tc_delta.function.name:
-                                entry["function"]["name"] += tc_delta.function.name
+                                # Use assignment, not +=.  Function names are
+                                # atomic identifiers delivered complete in the
+                                # first chunk (OpenAI spec).  Some providers
+                                # (MiniMax M2.7 via NVIDIA NIM) resend the full
+                                # name in every chunk; concatenation would
+                                # produce "read_fileread_file".  Assignment
+                                # (matching the OpenAI Node SDK / LiteLLM /
+                                # Vercel AI patterns) is immune to this.
+                                entry["function"]["name"] = tc_delta.function.name
                             if tc_delta.function.arguments:
                                 entry["function"]["arguments"] += tc_delta.function.arguments
                         extra = getattr(tc_delta, "extra_content", None)
diff --git a/scripts/release.py b/scripts/release.py
index 4c32dccfd..88ddb2f43 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -266,6 +266,7 @@ AUTHOR_MAP = {
     "limkuan24@gmail.com": "WideLee",
     "aviralarora002@gmail.com": "AviArora02-commits",
     "junminliu@gmail.com": "JimLiu",
+    "jarvischer@gmail.com": "maxchernin",
 }
 
 
diff --git a/tests/run_agent/test_streaming.py b/tests/run_agent/test_streaming.py
index 6afe36ee3..e4825599a 100644
--- a/tests/run_agent/test_streaming.py
+++ b/tests/run_agent/test_streaming.py
@@ -141,6 +141,50 @@ class TestStreamingAccumulator:
         assert tc[0].function.name == "terminal"
         assert tc[0].function.arguments == '{"command": "ls"}'
 
+    @patch("run_agent.AIAgent._create_request_openai_client")
+    @patch("run_agent.AIAgent._close_request_openai_client")
+    def test_tool_name_not_duplicated_when_resent_per_chunk(self, mock_close, mock_create):
+        """MiniMax M2.7 via NVIDIA NIM resends the full name in every chunk.
+
+        Bug #8259: the old += accumulation produced "read_fileread_file".
+        Assignment (matching OpenAI Node SDK / LiteLLM) prevents this.
+        """
+        from run_agent import AIAgent
+
+        chunks = [
+            _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, tc_id="call_nim", name="read_file")
+            ]),
+            _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, tc_id="call_nim", name="read_file", arguments='{"path":')
+            ]),
+            _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, tc_id="call_nim", name="read_file", arguments=' "x.py"}')
+            ]),
+            _make_stream_chunk(finish_reason="tool_calls"),
+        ]
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create.return_value = iter(chunks)
+        mock_create.return_value = mock_client
+
+        agent = AIAgent(
+            model="test/model",
+            quiet_mode=True,
+            skip_context_files=True,
+            skip_memory=True,
+        )
+        agent.api_mode = "chat_completions"
+        agent._interrupt_requested = False
+
+        response = agent._interruptible_streaming_api_call({})
+
+        tc = response.choices[0].message.tool_calls
+        assert tc is not None
+        assert len(tc) == 1
+        assert tc[0].function.name == "read_file"
+        assert tc[0].function.arguments == '{"path": "x.py"}'
+
     @patch("run_agent.AIAgent._create_request_openai_client")
     @patch("run_agent.AIAgent._close_request_openai_client")
     def test_tool_call_extra_content_preserved(self, mock_close, mock_create):

From f7af90e2daf2e2a11262ff3152bb3f08ff13ca37 Mon Sep 17 00:00:00 2001
From: LVT382009 <levantam.98.2324@gmail.com>
Date: Sat, 18 Apr 2026 22:49:30 +0530
Subject: [PATCH 2/5] fix: wire _ephemeral_max_output_tokens into
 chat_completions and add NVIDIA NIM default

Based on #12152 by @LVT382009.

Three fixes to run_agent.py:

1. _ephemeral_max_output_tokens consumption in chat_completions path:
   The error-recovery ephemeral override was only consumed in the
   anthropic_messages branch of _build_api_kwargs.  All chat_completions
   providers (OpenRouter, NVIDIA NIM, Qwen, Alibaba, custom, etc.)
   silently ignored it.  Now consumed at highest priority, matching the
   anthropic pattern.

2. NVIDIA NIM max_tokens default (16384):
   NVIDIA NIM falls back to a very low internal default when max_tokens
   is omitted, causing models like GLM-4.7 to truncate immediately
   (thinking tokens exhaust the budget before the response starts).

3. Progressive length-continuation boost:
   When finish_reason='length' triggers a continuation retry, the output
   budget now grows progressively (2x base on retry 1, 3x on retry 2,
   capped at 32768) via _ephemeral_max_output_tokens.  Previously the
   retry loop just re-sent the same token limit on all 3 attempts.
---
 run_agent.py       | 20 +++++++++++++++++++-
 scripts/release.py |  1 +
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/run_agent.py b/run_agent.py
index e88096a60..a0f4db548 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -7061,8 +7061,20 @@ class AIAgent:
         if self.tools:
             api_kwargs["tools"] = self.tools
 
-        if self.max_tokens is not None:
+        # ── max_tokens for chat_completions ──────────────────────────────
+        # Priority: ephemeral override (error recovery / length-continuation
+        # boost) > user-configured max_tokens > provider-specific defaults.
+        _ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None)
+        if _ephemeral_out is not None:
+            self._ephemeral_max_output_tokens = None  # consume immediately
+            api_kwargs.update(self._max_tokens_param(_ephemeral_out))
+        elif self.max_tokens is not None:
             api_kwargs.update(self._max_tokens_param(self.max_tokens))
+        elif "integrate.api.nvidia.com" in self._base_url_lower:
+            # NVIDIA NIM defaults to a very low max_tokens when omitted,
+            # causing models like GLM-4.7 to truncate immediately (thinking
+            # tokens alone exhaust the budget).  16384 provides adequate room.
+            api_kwargs.update(self._max_tokens_param(16384))
         elif self._is_qwen_portal():
             # Qwen Portal defaults to a very low max_tokens when omitted.
             # Reasoning models (qwen3-coder-plus) exhaust that budget on
@@ -10804,6 +10816,12 @@ class AIAgent:
                 continue
 
             if restart_with_length_continuation:
+                # Progressively boost the output token budget on each retry.
+                # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
+                # Applies to all providers via _ephemeral_max_output_tokens.
+                _boost_base = self.max_tokens if self.max_tokens else 4096
+                _boost = _boost_base * (length_continue_retries + 1)
+                self._ephemeral_max_output_tokens = min(_boost, 32768)
                 continue
 
             # Guard: if all retries exhausted without a successful response
diff --git a/scripts/release.py b/scripts/release.py
index 88ddb2f43..94ebef5d3 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -267,6 +267,7 @@ AUTHOR_MAP = {
     "aviralarora002@gmail.com": "AviArora02-commits",
     "junminliu@gmail.com": "JimLiu",
     "jarvischer@gmail.com": "maxchernin",
+    "levantam.98.2324@gmail.com": "LVT382009",
 }
 
 

From 2eab7ee15f9f0283ae1a6c466b0400caa44defbb Mon Sep 17 00:00:00 2001
From: helix4u <4317663+helix4u@users.noreply.github.com>
Date: Sat, 18 Apr 2026 13:00:04 -0600
Subject: [PATCH 3/5] fix(gemini): hide low-TPM Gemma models from exposed lists

---
 agent/models_dev.py                      | 32 +++++++++++++++++++++---
 hermes_cli/models.py                     |  2 --
 hermes_cli/setup.py                      |  1 -
 tests/hermes_cli/test_gemini_provider.py | 23 +++++++++++++++--
 4 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/agent/models_dev.py b/agent/models_dev.py
index 42c8925ff..cc4dbf0be 100644
--- a/agent/models_dev.py
+++ b/agent/models_dev.py
@@ -420,7 +420,10 @@ def list_provider_models(provider: str) -> List[str]:
     models = _get_provider_models(provider)
     if models is None:
         return []
-    return list(models.keys())
+    return [
+        mid for mid in models.keys()
+        if not _should_hide_from_provider_catalog(provider, mid)
+    ]
 
 
 # Patterns that indicate non-agentic or noise models (TTS, embedding,
@@ -432,6 +435,29 @@ _NOISE_PATTERNS: re.Pattern = re.compile(
     re.IGNORECASE,
 )
 
+# Google-hosted Gemma models currently have very low TPM quotas for agent-style
+# traffic (for example 15K/16K TPM tiers in AI Studio) and are not practical as
+# normal Hermes picks even though they advertise large context windows. Keep the
+# capability metadata available for direct/manual use, but hide them from the
+# Gemini model catalogs we surface in setup and model selection.
+_GOOGLE_GEMMA_HIDDEN_MODELS = frozenset({
+    "gemma-4-31b-it",
+    "gemma-4-26b-a4b-it",
+    "gemma-3-1b",
+    "gemma-3-2b",
+    "gemma-3-4b",
+    "gemma-3-12b",
+    "gemma-3-27b",
+})
+
+
+def _should_hide_from_provider_catalog(provider: str, model_id: str) -> bool:
+    provider_lower = (provider or "").strip().lower()
+    model_lower = (model_id or "").strip().lower()
+    if provider_lower in {"gemini", "google"} and model_lower in _GOOGLE_GEMMA_HIDDEN_MODELS:
+        return True
+    return False
+
 
 def list_agentic_models(provider: str) -> List[str]:
     """Return model IDs suitable for agentic use from models.dev.
@@ -448,6 +474,8 @@ def list_agentic_models(provider: str) -> List[str]:
     for mid, entry in models.items():
         if not isinstance(entry, dict):
             continue
+        if _should_hide_from_provider_catalog(provider, mid):
+            continue
         if not entry.get("tool_call", False):
             continue
         if _NOISE_PATTERNS.search(mid):
@@ -582,5 +610,3 @@ def get_model_info(
             return _parse_model_info(mid, mdata, mdev_id)
 
     return None
-
-
diff --git a/hermes_cli/models.py b/hermes_cli/models.py
index cbbeef62d..a0d7c2220 100644
--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@@ -133,8 +133,6 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
         "gemini-2.5-pro",
         "gemini-2.5-flash",
         "gemini-2.5-flash-lite",
-        # Gemma open models (also served via AI Studio)
-        "gemma-4-31b-it",
     ],
     "google-gemini-cli": [
         "gemini-2.5-pro",
diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py
index 8770386b7..8f6b633c6 100644
--- a/hermes_cli/setup.py
+++ b/hermes_cli/setup.py
@@ -91,7 +91,6 @@ _DEFAULT_PROVIDER_MODELS = {
     "gemini": [
         "gemini-3.1-pro-preview", "gemini-3-flash-preview", "gemini-3.1-flash-lite-preview",
         "gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.5-flash-lite",
-        "gemma-4-31b-it",
     ],
     "zai": ["glm-5.1", "glm-5", "glm-4.7", "glm-4.5", "glm-4.5-flash"],
     "kimi-coding": ["kimi-k2.5", "kimi-k2-thinking", "kimi-k2-turbo-preview"],
diff --git a/tests/hermes_cli/test_gemini_provider.py b/tests/hermes_cli/test_gemini_provider.py
index fd16e825d..7632f7691 100644
--- a/tests/hermes_cli/test_gemini_provider.py
+++ b/tests/hermes_cli/test_gemini_provider.py
@@ -130,7 +130,7 @@ class TestGeminiModelCatalog:
         models = _PROVIDER_MODELS["gemini"]
         assert "gemini-2.5-pro" in models
         assert "gemini-2.5-flash" in models
-        assert "gemma-4-31b-it" in models
+        assert "gemma-4-31b-it" not in models
 
     def test_provider_models_has_3x(self):
         models = _PROVIDER_MODELS["gemini"]
@@ -313,9 +313,28 @@ class TestGeminiModelsDev:
             result = list_agentic_models("gemini")
         assert "gemini-3-flash-preview" in result
         assert "gemini-2.5-pro" in result
-        assert "gemma-4-31b-it" in result
+        assert "gemma-4-31b-it" not in result
         # Filtered out:
         assert "gemini-embedding-001" not in result      # no tool_call
         assert "gemini-2.5-flash-preview-tts" not in result  # no tool_call
         assert "gemini-live-2.5-flash" not in result     # noise: live-
         assert "gemini-2.5-flash-preview-04-17" not in result  # noise: dated preview
+
+    def test_list_provider_models_hides_low_tpm_google_gemmas(self):
+        mock_data = {
+            "google": {
+                "models": {
+                    "gemini-2.5-pro": {},
+                    "gemma-4-31b-it": {},
+                    "gemma-3-1b": {},
+                }
+            }
+        }
+        with patch("agent.models_dev.fetch_models_dev", return_value=mock_data):
+            from agent.models_dev import list_provider_models
+
+            result = list_provider_models("gemini")
+
+        assert "gemini-2.5-pro" in result
+        assert "gemma-4-31b-it" not in result
+        assert "gemma-3-1b" not in result

From a7dd6a34499cb8fa91579b8943d251a8c2d42021 Mon Sep 17 00:00:00 2001
From: helix4u <4317663+helix4u@users.noreply.github.com>
Date: Sat, 18 Apr 2026 13:08:57 -0600
Subject: [PATCH 4/5] fix(gemini): hide stale and low-TPM Google models

---
 agent/models_dev.py                      | 28 ++++++++++++++++++------
 tests/hermes_cli/test_gemini_provider.py |  8 +++++--
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/agent/models_dev.py b/agent/models_dev.py
index cc4dbf0be..3e5c911e7 100644
--- a/agent/models_dev.py
+++ b/agent/models_dev.py
@@ -435,26 +435,40 @@ _NOISE_PATTERNS: re.Pattern = re.compile(
     re.IGNORECASE,
 )
 
-# Google-hosted Gemma models currently have very low TPM quotas for agent-style
-# traffic (for example 15K/16K TPM tiers in AI Studio) and are not practical as
-# normal Hermes picks even though they advertise large context windows. Keep the
-# capability metadata available for direct/manual use, but hide them from the
-# Gemini model catalogs we surface in setup and model selection.
-_GOOGLE_GEMMA_HIDDEN_MODELS = frozenset({
+# Google's live Gemini catalogs currently include a mix of stale slugs and
+# Gemma models whose TPM quotas are too small for normal Hermes agent traffic.
+# Keep capability metadata available for direct/manual use, but hide these from
+# the Gemini model catalogs we surface in setup and model selection.
+_GOOGLE_HIDDEN_MODELS = frozenset({
+    # Low-TPM Gemma models that trip Google input-token quota walls under
+    # agent-style traffic despite advertising large context windows.
     "gemma-4-31b-it",
+    "gemma-4-26b-it",
     "gemma-4-26b-a4b-it",
     "gemma-3-1b",
+    "gemma-3-1b-it",
     "gemma-3-2b",
+    "gemma-3-2b-it",
     "gemma-3-4b",
+    "gemma-3-4b-it",
     "gemma-3-12b",
+    "gemma-3-12b-it",
     "gemma-3-27b",
+    "gemma-3-27b-it",
+    # Stale/retired Google slugs that still surface through models.dev-backed
+    # Gemini selection but 404 on the current Google endpoints.
+    "gemini-1.5-flash",
+    "gemini-1.5-pro",
+    "gemini-1.5-flash-8b",
+    "gemini-2.0-flash",
+    "gemini-2.0-flash-lite",
 })
 
 
 def _should_hide_from_provider_catalog(provider: str, model_id: str) -> bool:
     provider_lower = (provider or "").strip().lower()
     model_lower = (model_id or "").strip().lower()
-    if provider_lower in {"gemini", "google"} and model_lower in _GOOGLE_GEMMA_HIDDEN_MODELS:
+    if provider_lower in {"gemini", "google"} and model_lower in _GOOGLE_HIDDEN_MODELS:
         return True
     return False
 
diff --git a/tests/hermes_cli/test_gemini_provider.py b/tests/hermes_cli/test_gemini_provider.py
index 7632f7691..9c6ee70aa 100644
--- a/tests/hermes_cli/test_gemini_provider.py
+++ b/tests/hermes_cli/test_gemini_provider.py
@@ -326,7 +326,9 @@ class TestGeminiModelsDev:
                 "models": {
                     "gemini-2.5-pro": {},
                     "gemma-4-31b-it": {},
-                    "gemma-3-1b": {},
+                    "gemma-3-27b-it": {},
+                    "gemini-1.5-pro": {},
+                    "gemini-2.0-flash": {},
                 }
             }
         }
@@ -337,4 +339,6 @@ class TestGeminiModelsDev:
 
         assert "gemini-2.5-pro" in result
         assert "gemma-4-31b-it" not in result
-        assert "gemma-3-1b" not in result
+        assert "gemma-3-27b-it" not in result
+        assert "gemini-1.5-pro" not in result
+        assert "gemini-2.0-flash" not in result

From ca32a2a60bd8655c001b96394e68309ba53b4550 Mon Sep 17 00:00:00 2001
From: helix4u <4317663+helix4u@users.noreply.github.com>
Date: Sat, 18 Apr 2026 13:15:27 -0600
Subject: [PATCH 5/5] fix(gemini): restore bearer auth on openai route

---
 agent/auxiliary_client.py                | 28 -------------------
 run_agent.py                             | 21 --------------
 tests/hermes_cli/test_gemini_provider.py | 35 +++++-------------------
 3 files changed, 7 insertions(+), 77 deletions(-)

diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 126f4615d..19bde946e 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -782,15 +782,6 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
                 from hermes_cli.models import copilot_default_headers
 
                 extra["default_headers"] = copilot_default_headers()
-            elif "generativelanguage.googleapis.com" in base_url.lower():
-                # Google's OpenAI-compatible endpoint only accepts x-goog-api-key.
-                # Passing api_key= causes the SDK to inject Authorization: Bearer,
-                # which Google rejects with HTTP 400 "Multiple authentication
-                # credentials received". Use a placeholder for api_key and pass
-                # the real key via x-goog-api-key header instead.
-                # Fixes: https://github.com/NousResearch/hermes-agent/issues/7893
-                extra["default_headers"] = {"x-goog-api-key": api_key}
-                api_key = "not-used"
             return OpenAI(api_key=api_key, base_url=base_url, **extra), model
 
         creds = resolve_api_key_provider_credentials(provider_id)
@@ -812,15 +803,6 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
             from hermes_cli.models import copilot_default_headers
 
             extra["default_headers"] = copilot_default_headers()
-        elif "generativelanguage.googleapis.com" in base_url.lower():
-            # Google's OpenAI-compatible endpoint only accepts x-goog-api-key.
-            # Passing api_key= causes the SDK to inject Authorization: Bearer,
-            # which Google rejects with HTTP 400 "Multiple authentication
-            # credentials received". Use a placeholder for api_key and pass
-            # the real key via x-goog-api-key header instead.
-            # Fixes: https://github.com/NousResearch/hermes-agent/issues/7893
-            extra["default_headers"] = {"x-goog-api-key": api_key}
-            api_key = "not-used"
         return OpenAI(api_key=api_key, base_url=base_url, **extra), model
 
     return None, None
@@ -1666,16 +1648,6 @@ def resolve_provider_client(
             from hermes_cli.models import copilot_default_headers
 
             headers.update(copilot_default_headers())
-        elif "generativelanguage.googleapis.com" in base_url.lower():
-            # Google's OpenAI-compatible endpoint only accepts x-goog-api-key.
-            # Passing api_key= causes the OpenAI SDK to inject Authorization: Bearer,
-            # which Google rejects with HTTP 400 "Multiple authentication credentials
-            # received". Use a placeholder for api_key and pass the real key via
-            # x-goog-api-key header instead.
-            # Fixes: https://github.com/NousResearch/hermes-agent/issues/7893
-            headers["x-goog-api-key"] = api_key
-            api_key = "not-used"
-
         client = OpenAI(api_key=api_key, base_url=base_url,
                         **({"default_headers": headers} if headers else {}))
 
diff --git a/run_agent.py b/run_agent.py
index a0f4db548..756bb62ed 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -1054,16 +1054,6 @@ class AIAgent:
                     }
                 elif "portal.qwen.ai" in effective_base.lower():
                     client_kwargs["default_headers"] = _qwen_portal_headers()
-                elif "generativelanguage.googleapis.com" in effective_base.lower():
-                    # Google's OpenAI-compatible endpoint only accepts x-goog-api-key.
-                    # The OpenAI SDK auto-injects Authorization: Bearer when api_key= is
-                    # set to a real value, causing HTTP 400 "Multiple authentication
-                    # credentials received".  Pass a placeholder so the SDK does not
-                    # emit Bearer, and carry the real key via x-goog-api-key instead.
-                    # Fixes: https://github.com/NousResearch/hermes-agent/issues/7893
-                    real_key = client_kwargs["api_key"]
-                    client_kwargs["api_key"] = "not-used"
-                    client_kwargs["default_headers"] = {"x-goog-api-key": real_key}
             else:
                 # No explicit creds — use the centralized provider router
                 from agent.auxiliary_client import resolve_provider_client
@@ -5245,17 +5235,6 @@ class AIAgent:
             self._client_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.30.0"}
         elif "portal.qwen.ai" in normalized:
             self._client_kwargs["default_headers"] = _qwen_portal_headers()
-        elif "generativelanguage.googleapis.com" in normalized:
-            # Google's endpoint rejects Bearer tokens; use x-goog-api-key instead.
-            # Swap the real key out of api_key and into the header so the OpenAI
-            # SDK does not emit Authorization: Bearer.
-            # Fixes: https://github.com/NousResearch/hermes-agent/issues/7893
-            real_key = self._client_kwargs.get("api_key", "")
-            if real_key and real_key != "not-used":
-                self._client_kwargs["api_key"] = "not-used"
-            self._client_kwargs["default_headers"] = {
-                "x-goog-api-key": real_key or self._client_kwargs.get("api_key", ""),
-            }
         else:
             self._client_kwargs.pop("default_headers", None)
 
diff --git a/tests/hermes_cli/test_gemini_provider.py b/tests/hermes_cli/test_gemini_provider.py
index 9c6ee70aa..dbb1111fc 100644
--- a/tests/hermes_cli/test_gemini_provider.py
+++ b/tests/hermes_cli/test_gemini_provider.py
@@ -207,14 +207,8 @@ class TestGeminiAgentInit:
             assert agent.api_mode == "chat_completions"
             assert agent.provider == "gemini"
 
-    def test_gemini_uses_x_goog_api_key_not_bearer(self, monkeypatch):
-        """Regression test for issue #7893.
-
-        When provider=gemini, the OpenAI client must be constructed with
-        api_key='not-used' and default_headers={'x-goog-api-key': real_key}.
-        This prevents the SDK from injecting Authorization: Bearer, which
-        Google's endpoint rejects with HTTP 400.
-        """
+    def test_gemini_uses_bearer_auth(self, monkeypatch):
+        """Gemini OpenAI-compatible endpoint should receive the real API key."""
         monkeypatch.setenv("GOOGLE_API_KEY", "AIzaSy_REAL_KEY")
         real_key = "AIzaSy_REAL_KEY"
         with patch("run_agent.OpenAI") as mock_openai:
@@ -227,37 +221,22 @@ class TestGeminiAgentInit:
                 base_url="https://generativelanguage.googleapis.com/v1beta/openai",
             )
         call_kwargs = mock_openai.call_args[1]
-        # The SDK must NOT receive the real key as api_key (which would emit Bearer)
-        assert call_kwargs.get("api_key") == "not-used", (
-            "api_key must be 'not-used' to suppress Authorization: Bearer for Gemini"
-        )
-        # The real key must be in x-goog-api-key header
+        assert call_kwargs.get("api_key") == real_key
         headers = call_kwargs.get("default_headers", {})
-        assert headers.get("x-goog-api-key") == real_key, (
-            "x-goog-api-key header must carry the real Gemini API key"
-        )
+        assert "x-goog-api-key" not in headers
 
     def test_gemini_resolve_provider_client_auth(self, monkeypatch):
-        """Regression test for issue #7893 — resolve_provider_client path.
-
-        When resolve_provider_client('gemini') is called, the returned OpenAI
-        client must use x-goog-api-key header, not Authorization: Bearer.
-        """
+        """resolve_provider_client('gemini') should pass the real API key through."""
         monkeypatch.setenv("GEMINI_API_KEY", "AIzaSy_TEST_KEY")
         real_key = "AIzaSy_TEST_KEY"
         with patch("agent.auxiliary_client.OpenAI") as mock_openai:
             mock_openai.return_value = MagicMock()
-            mock_openai.return_value.api_key = "not-used"
             from agent.auxiliary_client import resolve_provider_client
             resolve_provider_client("gemini")
         call_kwargs = mock_openai.call_args[1]
-        assert call_kwargs.get("api_key") == "not-used", (
-            "api_key must be 'not-used' to prevent Bearer injection for Gemini"
-        )
+        assert call_kwargs.get("api_key") == real_key
         headers = call_kwargs.get("default_headers", {})
-        assert headers.get("x-goog-api-key") == real_key, (
-            "x-goog-api-key header must carry the real Gemini API key"
-        )
+        assert "x-goog-api-key" not in headers
 
 
 # ── models.dev Integration ──