fix(state): JSON-encode multimodal message content for sqlite

sqlite3 can only bind str/bytes/int/float/None to query parameters. Multimodal message content is a list of parts (text + image_url), which raised 'Error binding parameter 3: type list is not supported' in append_message and replace_messages. In the CLI/TUI this surfaced as a visible crash when users pasted screenshots. In the gateway it was silently swallowed by a bare except in append_to_transcript, causing multimodal turns to be lost from the session transcript. Fix at the DB layer: _encode_content wraps lists/dicts as '\\x00json:' + json.dumps(...) on write, _decode_content unwraps on read. Plain strings are untouched, so existing FTS search, previews, and JSONL compat are unaffected. Paired decode in get_messages, get_messages_as_conversation, and search_messages context previews. Regression test covers: list content round-trip, dict content round-trip, string content stored unchanged, replace_messages with multimodal content. Also included: aligned fix #17522 for TUI image attachment with paths containing spaces (see previous commit).
2026-04-30 20:22:40 -07:00
parent cc340c4a4d
commit 531ac20408
3 changed files with 162 additions and 12 deletions
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -1154,6 +1154,48 @@ class SessionDB:
    # Message storage
    # =========================================================================

+    # Sentinel prefix used to distinguish JSON-encoded structured content
+    # (multimodal messages: lists of parts like text + image_url) from plain
+    # string content. The NUL byte is not legal in normal text, so this
+    # cannot collide with real user content.
+    _CONTENT_JSON_PREFIX = "\x00json:"
+
+    @classmethod
+    def _encode_content(cls, content: Any) -> Any:
+        """Serialize structured (list/dict) message content for sqlite.
+
+        sqlite3 can only bind ``str``, ``bytes``, ``int``, ``float``, and ``None``
+        to query parameters. Multimodal messages have ``content`` as a list of
+        parts (``[{"type": "text", ...}, {"type": "image_url", ...}]``), which
+        raises ``ProgrammingError: Error binding parameter N: type 'list' is
+        not supported`` when bound directly.
+
+        Returns the value unchanged when it's already a safe scalar, or a
+        sentinel-prefixed JSON string for lists/dicts. Paired with
+        :meth:`_decode_content` on read.
+        """
+        if content is None or isinstance(content, (str, bytes, int, float)):
+            return content
+        try:
+            return cls._CONTENT_JSON_PREFIX + json.dumps(content)
+        except (TypeError, ValueError):
+            # Last-resort fallback: stringify so persistence never fails.
+            return str(content)
+
+    @classmethod
+    def _decode_content(cls, content: Any) -> Any:
+        """Reverse :meth:`_encode_content`; returns scalars unchanged."""
+        if isinstance(content, str) and content.startswith(cls._CONTENT_JSON_PREFIX):
+            try:
+                return json.loads(content[len(cls._CONTENT_JSON_PREFIX):])
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(
+                    "Failed to decode JSON-encoded message content; "
+                    "returning raw string"
+                )
+                return content
+        return content
+
    def append_message(
        self,
        session_id: str,
@@ -1190,6 +1232,9 @@ class SessionDB:
            if codex_message_items else None
        )
        tool_calls_json = json.dumps(tool_calls) if tool_calls else None
+        # Multimodal content (list of parts) must be JSON-encoded: sqlite3
+        # cannot bind list/dict parameters directly.
+        stored_content = self._encode_content(content)

        # Pre-compute tool call count
        num_tool_calls = 0
@@ -1206,7 +1251,7 @@ class SessionDB:
                (
                    session_id,
                    role,
-                    content,
+                    stored_content,
                    tool_call_id,
                    tool_calls_json,
                    tool_name,
@@ -1289,7 +1334,7 @@ class SessionDB:
                    (
                        session_id,
                        role,
-                        msg.get("content"),
+                        self._encode_content(msg.get("content")),
                        msg.get("tool_call_id"),
                        tool_calls_json,
                        msg.get("tool_name"),
@@ -1328,6 +1373,8 @@ class SessionDB:
        result = []
        for row in rows:
            msg = dict(row)
+            if "content" in msg:
+                msg["content"] = self._decode_content(msg["content"])
            if msg.get("tool_calls"):
                try:
                    msg["tool_calls"] = json.loads(msg["tool_calls"])
@@ -1425,7 +1472,7 @@ class SessionDB:

        messages = []
        for row in rows:
-            content = row["content"]
+            content = self._decode_content(row["content"])
            if row["role"] in {"user", "assistant"} and isinstance(content, str):
                content = sanitize_context(content).strip()
            msg = {"role": row["role"], "content": content}
@@ -1810,10 +1857,26 @@ class SessionDB:
                           )""",
                        (match["id"], match["id"]),
                    )
-                    context_msgs = [
-                        {"role": r["role"], "content": (r["content"] or "")[:200]}
-                        for r in ctx_cursor.fetchall()
-                    ]
+                    context_msgs = []
+                    for r in ctx_cursor.fetchall():
+                        raw = r["content"]
+                        decoded = self._decode_content(raw)
+                        # Multimodal context: render a compact text-only
+                        # summary for search previews.
+                        if isinstance(decoded, list):
+                            text_parts = [
+                                p.get("text", "") for p in decoded
+                                if isinstance(p, dict) and p.get("type") == "text"
+                            ]
+                            text = " ".join(t for t in text_parts if t).strip()
+                            preview = text or "[multimodal content]"
+                        elif isinstance(decoded, str):
+                            preview = decoded
+                        else:
+                            preview = ""
+                        context_msgs.append(
+                            {"role": r["role"], "content": preview[:200]}
+                        )
                match["context"] = context_msgs
            except Exception:
                match["context"] = []