fix(tui): add missing hermes_cli.voice wrapper for gateway RPC

tui_gateway/server.py:3486/3491/3509 imports start_recording, stop_and_transcribe, and speak_text from hermes_cli.voice, but the module never existed (not in git history — never shipped, never deleted). Every voice.record / voice.tts RPC call hit the ImportError branch and the TUI surfaced it as "voice module not available — install audio dependencies" even on boxes with sounddevice / faster-whisper / numpy installed.

Adds a thin wrapper on top of tools.voice_mode (recording + transcription) and tools.tts_tool (text-to-speech):

- start_recording() — idempotent; stores the active AudioRecorder in a module-global guarded by a Lock so repeat Ctrl+B presses don't fight over the mic.
- stop_and_transcribe() — returns None for no-op / no-speech / Whisper-hallucination cases so the TUI's existing "no speech detected" path keeps working unchanged.
- speak_text(text) — lazily imports tts_tool (optional provider SDKs stay unloaded until the first /voice tts call), parses the tool's JSON result, and plays the audio via play_audio_file.

Paired with the Ctrl+B keybinding fix in the prior commit, the TUI voice pipeline now works end-to-end for the first time.
2026-04-24 00:21:59 +03:00
parent 3504bd401b
commit 0bb460b070
2 changed files with 173 additions and 0 deletions
--- a/hermes_cli/voice.py
+++ b/hermes_cli/voice.py
@@ -0,0 +1,120 @@
+"""Process-wide voice recording + TTS API for the TUI gateway.
+
+Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool``
+(text-to-speech) behind idempotent, stateful entry points that the gateway's
+``voice.record`` and ``voice.tts`` JSON-RPC handlers can call from a
+dedicated thread. The gateway imports this module lazily so missing optional
+audio deps (sounddevice, faster-whisper, numpy) surface as an ``ImportError``
+at call time, not at startup.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import threading
+from typing import Optional
+
+from tools.voice_mode import (
+    create_audio_recorder,
+    is_whisper_hallucination,
+    play_audio_file,
+    transcribe_recording,
+)
+
+logger = logging.getLogger(__name__)
+
+_recorder = None  # recorder stored by start_recording(); None when idle
+_recorder_lock = threading.Lock()  # held while _recorder is read or rebound
+
+
+def start_recording() -> None:
+    """Start capturing audio unless a recording is already in progress.
+
+    Repeat calls during an active recording are no-ops (toggle-friendly).
+    No silence callback is set: stopping is driven by ``voice.record``.
+    """
+    global _recorder
+
+    with _recorder_lock:
+        # getattr's default also covers the "nothing stored yet" case.
+        if getattr(_recorder, "is_recording", False):
+            return
+        # Published only after start() has returned without raising.
+        fresh = create_audio_recorder()
+        fresh.start()
+        _recorder = fresh
+
+
+def stop_and_transcribe() -> Optional[str]:
+    """Finish the current recording and return its transcript, if any.
+
+    The stored recorder is detached under the lock, then stopped and
+    transcribed after the lock is released. ``None`` is returned when
+    nothing was stored, when ``stop()`` yields no path, when transcription
+    raises (logged as a warning), or when the text is empty or flagged by
+    ``is_whisper_hallucination``.
+    """
+    global _recorder
+
+    # Take the recorder and clear the global in one locked step.
+    with _recorder_lock:
+        active, _recorder = _recorder, None
+
+    if active is None:
+        return None
+
+    recorded_path = active.stop()
+    if not recorded_path:
+        return None
+
+    try:
+        outcome = transcribe_recording(recorded_path)
+    except Exception as exc:
+        logger.warning("voice transcription failed: %s", exc)
+        return None
+
+    transcript = (outcome.get("text") or "").strip()
+    if transcript and not is_whisper_hallucination(transcript):
+        return transcript
+
+    return None
+
+
+def speak_text(text: str) -> None:
+    """Synthesize ``text`` with the configured TTS provider and play it.
+
+    The gateway spawns a daemon thread to call this so the RPC returns
+    immediately. Failures — import, synthesis, parsing, playback — are
+    logged and swallowed: the UI already acknowledged "speaking".
+    """
+    if not text or not text.strip():
+        return
+
+    try:
+        # Lazy import (optional provider SDKs); in the try so it is logged.
+        from tools.tts_tool import text_to_speech_tool
+        raw = text_to_speech_tool(text)
+    except Exception as e:
+        logger.warning("TTS synthesis failed: %s", e)
+        return
+
+    try:
+        result = json.loads(raw) if isinstance(raw, str) else raw
+    except json.JSONDecodeError:
+        logger.warning("TTS returned non-JSON result")
+        return
+
+    if not isinstance(result, dict):
+        return
+
+    file_path = result.get("file_path")
+    if not file_path:
+        err = result.get("error") or "no file_path in TTS result"
+        logger.warning("TTS succeeded but produced no audio: %s", err)
+        return
+
+    try:
+        play_audio_file(file_path)
+    except Exception as e:
+        logger.warning("TTS playback failed: %s", e)
--- a/tests/hermes_cli/test_voice_wrapper.py
+++ b/tests/hermes_cli/test_voice_wrapper.py
@@ -0,0 +1,53 @@
+"""Tests for ``hermes_cli.voice`` — the TUI gateway's voice wrapper.
+
+The module is imported *lazily* by ``tui_gateway/server.py`` so that a
+box with missing audio deps fails at call time (returning a clean RPC
+error) rather than at gateway startup. These tests therefore only
+assert the public contract the gateway depends on: the three symbols
+exist, ``stop_and_transcribe`` is a no-op when nothing is recording,
+and ``speak_text`` tolerates empty input without touching the provider
+stack.
+"""
+
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # parent of this test dir
+
+
+class TestPublicAPI:
+    def test_gateway_symbols_importable(self):
+        """The three names the gateway imports exist and are callable."""
+        from hermes_cli.voice import (
+            speak_text,
+            start_recording,
+            stop_and_transcribe,
+        )
+
+        exported = (start_recording, stop_and_transcribe, speak_text)
+        for entry_point in exported:
+            assert callable(entry_point)
+
+
+class TestStopWithoutStart:
+    def test_returns_none_when_no_recording_active(self, monkeypatch):
+        """No stored recorder: stop returns ``None`` without raising."""
+        from hermes_cli import voice as voice_module
+
+        monkeypatch.setattr(voice_module, "_recorder", None)
+        outcome = voice_module.stop_and_transcribe()
+        assert outcome is None
+
+
+class TestSpeakTextGuards:
+    @pytest.mark.parametrize("text", ["", "   ", "\n\t  "])
+    def test_empty_text_is_noop(self, text):
+        """Blank input is a no-op: ``speak_text`` returns ``None`` quietly.
+
+        Only the return value is asserted here; whether ``tts_tool`` gets
+        imported on this path is not checked by this test.
+        """
+        from hermes_cli import voice
+        assert voice.speak_text(text) is None