feat(whatsapp): add voice message transcription via OpenAI/Groq Whisper

Automatically transcribe WhatsApp voice messages through the Whisper API of either OpenAI or Groq. Select the backend with transcriptionProvider ("openai", the default, or "groq") and supply its credentials with transcriptionApiKey. Example config: "whatsapp": { "transcriptionProvider": "openai", "transcriptionApiKey": "sk-..." }
2026-04-27 05:15:51 +00:00 · 2026-03-26 21:46:31 -05:00 · 2026-03-26 21:46:31 -05:00 · db50dd8a77
commit db50dd8a77
parent e7d371ec1e
3 changed files with 51 additions and 10 deletions
--- a/nanobot/channels/base.py
+++ b/nanobot/channels/base.py
@ -37,13 +37,17 @@ class BaseChannel(ABC):
        self._running = False
    async def transcribe_audio(self, file_path: str | Path) -> str:
-        """Transcribe an audio file via Groq Whisper. Returns empty string on failure."""
+        """Transcribe an audio file via Whisper (OpenAI or Groq). Returns empty string on failure."""
        if not self.transcription_api_key:
            return ""
        try:
-            from nanobot.providers.transcription import GroqTranscriptionProvider
+            provider_name = getattr(self, "transcription_provider", "groq")
-
+            if provider_name == "openai":
-            provider = GroqTranscriptionProvider(api_key=self.transcription_api_key)
+                from nanobot.providers.transcription import OpenAITranscriptionProvider
                provider = OpenAITranscriptionProvider(api_key=self.transcription_api_key)
            else:
                from nanobot.providers.transcription import GroqTranscriptionProvider
                provider = GroqTranscriptionProvider(api_key=self.transcription_api_key)
            return await provider.transcribe(file_path)
        except Exception as e:
            logger.warning("{}: audio transcription failed: {}", self.name, e)
--- a/nanobot/channels/whatsapp.py
+++ b/nanobot/channels/whatsapp.py
@ -26,6 +26,8 @@ class WhatsAppConfig(Base):
    bridge_url: str = "ws://localhost:3001"
    bridge_token: str = ""
    allow_from: list[str] = Field(default_factory=list)
    transcription_provider: str = "openai"  # openai or groq
    transcription_api_key: str = ""
    group_policy: Literal["open", "mention"] = "open"  # "open" responds to all, "mention" only when @mentioned
@ -51,6 +53,8 @@ class WhatsAppChannel(BaseChannel):
        self._ws = None
        self._connected = False
        self._processed_message_ids: OrderedDict[str, None] = OrderedDict()
        self.transcription_api_key = config.transcription_api_key
        self.transcription_provider = config.transcription_provider
    async def login(self, force: bool = False) -> bool:
        """
@ -203,11 +207,16 @@ class WhatsAppChannel(BaseChannel):
            # Handle voice transcription if it's a voice message
            if content == "[Voice Message]":
-                logger.info(
+                if media_paths:
-                    "Voice message received from {}, but direct download from bridge is not yet supported.",
+                    logger.info("Transcribing voice message from {}...", sender_id)
-                    sender_id,
+                    transcription = await self.transcribe_audio(media_paths[0])
-                )
+                    if transcription:
-                content = "[Voice Message: Transcription not available for WhatsApp yet]"
+                        content = transcription
                        logger.info("Transcribed voice from {}: {}...", sender_id, transcription[:50])
                    else:
                        content = "[Voice Message: Transcription failed]"
                else:
                    content = "[Voice Message: Audio not available]"
            # Extract media paths (images/documents/videos downloaded by the bridge)
            media_paths = data.get("media") or []
--- a/nanobot/providers/transcription.py
+++ b/nanobot/providers/transcription.py
@ -1,8 +1,36 @@
-"""Voice transcription provider using Groq."""
+"""Voice transcription providers (Groq and OpenAI Whisper)."""
 import os
 from pathlib import Path
 class OpenAITranscriptionProvider:
    """Voice transcription provider using OpenAI's Whisper API."""
    def __init__(self, api_key: str | None = None):
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self.api_url = "https://api.openai.com/v1/audio/transcriptions"
    async def transcribe(self, file_path: str | Path) -> str:
        if not self.api_key:
            return ""
        path = Path(file_path)
        if not path.exists():
            return ""
        try:
            import httpx
            async with httpx.AsyncClient() as client:
                with open(path, "rb") as f:
                    files = {"file": (path.name, f), "model": (None, "whisper-1")}
                    headers = {"Authorization": f"Bearer {self.api_key}"}
                    response = await client.post(
                        self.api_url, headers=headers, files=files, timeout=60.0,
                    )
                    response.raise_for_status()
                    return response.json().get("text", "")
        except Exception:
            return ""
 import httpx
 from loguru import logger