From 22cfad157b8ec9b4a8ecc0d2657567d7890d7207 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Thu, 26 Mar 2026 19:13:07 -0700
Subject: [PATCH] =?UTF-8?q?fix:=20gateway=20token=20double-counting=20?=
 =?UTF-8?q?=E2=80=94=20use=20absolute=20set=20instead=20of=20increment=20(?=
 =?UTF-8?q?#3317)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gateway's update_session() used += for token counts, but the cached
agent's session_prompt_tokens / session_completion_tokens are cumulative
totals that grow across messages. Each update_session call re-added the
running total, inflating usage stats with every message (1.7x after 3
messages, worse over longer conversations).

Fix: change += to = for in-memory entry fields, add set_token_counts()
to SessionDB that uses direct assignment instead of SQL increment, and
switch the gateway to call it.

CLI mode continues using update_token_counts() (increment) since it
tracks per-API-call deltas — that path is unchanged.

Based on analysis from PR #3222 by @zaycruz (closed).

Co-authored-by: zaycruz
---
 gateway/session.py            |  2 +-
 hermes_state.py               | 74 +++++++++++++++++++++++++++++++++++
 tests/gateway/test_session.py |  2 +-
 3 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/gateway/session.py b/gateway/session.py
index 2d5376b07..5aefb6c01 100644
--- a/gateway/session.py
+++ b/gateway/session.py
@@ -785,7 +785,7 @@ class SessionStore:
 
         if self._db and db_session_id:
             try:
-                self._db.update_token_counts(
+                self._db.set_token_counts(
                     db_session_id,
                     input_tokens=input_tokens,
                     output_tokens=output_tokens,
diff --git a/hermes_state.py b/hermes_state.py
index cf03951c7..b39c9c1f7 100644
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -415,6 +415,80 @@ class SessionDB:
             )
             self._conn.commit()
 
+    def set_token_counts(
+        self,
+        session_id: str,
+        input_tokens: int = 0,
+        output_tokens: int = 0,
+        model: Optional[str] = None,
+        cache_read_tokens: int = 0,
+        cache_write_tokens: int = 0,
+        reasoning_tokens: int = 0,
+        estimated_cost_usd: Optional[float] = None,
+        actual_cost_usd: Optional[float] = None,
+        cost_status: Optional[str] = None,
+        cost_source: Optional[str] = None,
+        pricing_version: Optional[str] = None,
+        billing_provider: Optional[str] = None,
+        billing_base_url: Optional[str] = None,
+        billing_mode: Optional[str] = None,
+    ) -> None:
+        """Set token counters to absolute values (not increment).
+
+        Use this when the caller provides cumulative totals from a completed
+        conversation run (e.g. the gateway, where the cached agent's
+        session_prompt_tokens already reflects the running total).
+
+        The five token counters are always overwritten, so pass the full
+        running totals; a counter left at its default is stored as 0.
+
+        Cost amounts, cost_status, cost_source and pricing_version are
+        overwritten only when a non-None value is passed; otherwise the
+        stored value is kept.  model and the billing_* fields are filled in
+        only while the stored column is still NULL.
+        """
+        with self._lock:
+            self._conn.execute(
+                """UPDATE sessions SET
+                   input_tokens = ?,
+                   output_tokens = ?,
+                   cache_read_tokens = ?,
+                   cache_write_tokens = ?,
+                   reasoning_tokens = ?,
+                   estimated_cost_usd = COALESCE(?, estimated_cost_usd),
+                   actual_cost_usd = CASE
+                       WHEN ? IS NULL THEN actual_cost_usd
+                       ELSE ?
+                   END,
+                   cost_status = COALESCE(?, cost_status),
+                   cost_source = COALESCE(?, cost_source),
+                   pricing_version = COALESCE(?, pricing_version),
+                   billing_provider = COALESCE(billing_provider, ?),
+                   billing_base_url = COALESCE(billing_base_url, ?),
+                   billing_mode = COALESCE(billing_mode, ?),
+                   model = COALESCE(model, ?)
+                   WHERE id = ?""",
+                (
+                    input_tokens,
+                    output_tokens,
+                    cache_read_tokens,
+                    cache_write_tokens,
+                    reasoning_tokens,
+                    estimated_cost_usd,
+                    actual_cost_usd,
+                    actual_cost_usd,
+                    cost_status,
+                    cost_source,
+                    pricing_version,
+                    billing_provider,
+                    billing_base_url,
+                    billing_mode,
+                    model,
+                    session_id,
+                ),
+            )
+            self._conn.commit()
+
     def get_session(self, session_id: str) -> Optional[Dict[str, Any]]:
         """Get a session by ID."""
         with self._lock:
diff --git a/tests/gateway/test_session.py b/tests/gateway/test_session.py
index 226e50593..82281acc2 100644
--- a/tests/gateway/test_session.py
+++ b/tests/gateway/test_session.py
@@ -846,7 +846,7 @@ class TestLastPromptTokens:
 
         store.update_session("k1", model="openai/gpt-5.4")
 
-        store._db.update_token_counts.assert_called_once_with(
+        store._db.set_token_counts.assert_called_once_with(
             "s1",
             input_tokens=0,
             output_tokens=0,