From 22cfad157b8ec9b4a8ecc0d2657567d7890d7207 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Thu, 26 Mar 2026 19:13:07 -0700
Subject: [PATCH] =?UTF-8?q?fix:=20gateway=20token=20double-counting=20?=
 =?UTF-8?q?=E2=80=94=20use=20absolute=20set=20instead=20of=20increment=20(?=
 =?UTF-8?q?#3317)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gateway's update_session() used += for token counts, but the cached
agent's session_prompt_tokens / session_completion_tokens are cumulative
totals that grow across messages. Each update_session call re-added the
running total, inflating usage stats with every message (1.7x after 3
messages, worse over longer conversations).

Fix: change += to = for in-memory entry fields, add set_token_counts()
to SessionDB that uses direct assignment instead of SQL increment, and
switch the gateway to call it.

CLI mode continues using update_token_counts() (increment) since it
tracks per-API-call deltas — that path is unchanged.

Based on analysis from PR #3222 by @zaycruz (closed).

Co-authored-by: zaycruz <zay@users.noreply.github.com>
---
 gateway/session.py            |  2 +-
 hermes_state.py               | 66 +++++++++++++++++++++++++++++++++++
 tests/gateway/test_session.py |  2 +-
 3 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/gateway/session.py b/gateway/session.py
index 2d5376b07..5aefb6c01 100644
--- a/gateway/session.py
+++ b/gateway/session.py
@@ -785,7 +785,7 @@ class SessionStore:
 
         if self._db and db_session_id:
             try:
-                self._db.update_token_counts(
+                self._db.set_token_counts(
                     db_session_id,
                     input_tokens=input_tokens,
                     output_tokens=output_tokens,
diff --git a/hermes_state.py b/hermes_state.py
index cf03951c7..b39c9c1f7 100644
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -415,6 +415,72 @@ class SessionDB:
             )
             self._conn.commit()
 
+    def set_token_counts(
+        self,
+        session_id: str,
+        input_tokens: int = 0,
+        output_tokens: int = 0,
+        model: str = None,
+        cache_read_tokens: int = 0,
+        cache_write_tokens: int = 0,
+        reasoning_tokens: int = 0,
+        estimated_cost_usd: Optional[float] = None,
+        actual_cost_usd: Optional[float] = None,
+        cost_status: Optional[str] = None,
+        cost_source: Optional[str] = None,
+        pricing_version: Optional[str] = None,
+        billing_provider: Optional[str] = None,
+        billing_base_url: Optional[str] = None,
+        billing_mode: Optional[str] = None,
+    ) -> None:
+        """Set token counters to absolute values (not increment).
+
+        Use this when the caller provides cumulative totals from a completed
+        conversation run (e.g. the gateway, where the cached agent's
+        session_prompt_tokens already reflects the running total).
+        """
+        with self._lock:  # held across both the UPDATE and the commit
+            self._conn.execute(
+                """UPDATE sessions SET
+                   input_tokens = ?,
+                   output_tokens = ?,
+                   cache_read_tokens = ?,
+                   cache_write_tokens = ?,
+                   reasoning_tokens = ?,
+                   estimated_cost_usd = ?,
+                   actual_cost_usd = CASE
+                       WHEN ? IS NULL THEN actual_cost_usd
+                       ELSE ?
+                   END,
+                   cost_status = COALESCE(?, cost_status),
+                   cost_source = COALESCE(?, cost_source),
+                   pricing_version = COALESCE(?, pricing_version),
+                   billing_provider = COALESCE(billing_provider, ?),
+                   billing_base_url = COALESCE(billing_base_url, ?),
+                   billing_mode = COALESCE(billing_mode, ?),
+                   model = COALESCE(model, ?)
+                   WHERE id = ?""",
+                (  # positional binds: order must mirror the ? placeholders above
+                    input_tokens,  # the five token counters are plain "= ?" overwrites,
+                    output_tokens,  # so an omitted argument stores its default of 0
+                    cache_read_tokens,
+                    cache_write_tokens,
+                    reasoning_tokens,
+                    estimated_cost_usd,  # also plain "= ?": no NULL guard, None is written through
+                    actual_cost_usd,  # 1st bind: the CASE test; None keeps the stored value
+                    actual_cost_usd,  # 2nd bind: the CASE result written when not None
+                    cost_status,  # COALESCE(?, col): a non-None argument replaces the column
+                    cost_source,  # same policy as cost_status
+                    pricing_version,  # same policy as cost_status
+                    billing_provider,  # COALESCE(col, ?): stored value wins; only fills NULL
+                    billing_base_url,  # same policy as billing_provider
+                    billing_mode,  # same policy as billing_provider
+                    model,  # same policy as billing_provider: first non-NULL model sticks
+                    session_id,  # WHERE id = ?
+                ),
+            )
+            self._conn.commit()
+
     def get_session(self, session_id: str) -> Optional[Dict[str, Any]]:
         """Get a session by ID."""
         with self._lock:
diff --git a/tests/gateway/test_session.py b/tests/gateway/test_session.py
index 226e50593..82281acc2 100644
--- a/tests/gateway/test_session.py
+++ b/tests/gateway/test_session.py
@@ -846,7 +846,7 @@ class TestLastPromptTokens:
 
         store.update_session("k1", model="openai/gpt-5.4")
 
-        store._db.update_token_counts.assert_called_once_with(
+        store._db.set_token_counts.assert_called_once_with(
             "s1",
             input_tokens=0,
             output_tokens=0,