feat(whatsapp): add voice message transcription via OpenAI/Groq Whisper

Automatically transcribe WhatsApp voice messages using the OpenAI or Groq
Whisper API. Configurable via `transcriptionProvider` and `transcriptionApiKey`.

Config:
  "whatsapp": {
    "transcriptionProvider": "openai",
    "transcriptionApiKey": "sk-..."
  }
This commit is contained in:
comadreja 2026-03-26 21:46:31 -05:00
parent e7d371ec1e
commit db50dd8a77
3 changed files with 51 additions and 10 deletions

View File

@ -37,13 +37,17 @@ class BaseChannel(ABC):
self._running = False
async def transcribe_audio(self, file_path: str | Path) -> str:
"""Transcribe an audio file via Groq Whisper. Returns empty string on failure."""
"""Transcribe an audio file via Whisper (OpenAI or Groq). Returns empty string on failure."""
if not self.transcription_api_key:
return ""
try:
from nanobot.providers.transcription import GroqTranscriptionProvider
provider = GroqTranscriptionProvider(api_key=self.transcription_api_key)
provider_name = getattr(self, "transcription_provider", "groq")
if provider_name == "openai":
from nanobot.providers.transcription import OpenAITranscriptionProvider
provider = OpenAITranscriptionProvider(api_key=self.transcription_api_key)
else:
from nanobot.providers.transcription import GroqTranscriptionProvider
provider = GroqTranscriptionProvider(api_key=self.transcription_api_key)
return await provider.transcribe(file_path)
except Exception as e:
logger.warning("{}: audio transcription failed: {}", self.name, e)

View File

@ -26,6 +26,8 @@ class WhatsAppConfig(Base):
# WebSocket endpoint of the WhatsApp bridge process.
bridge_url: str = "ws://localhost:3001"
# Shared secret presented when connecting to the bridge.
bridge_token: str = ""
# Sender IDs allowed to interact; presumably an empty list allows all — TODO confirm.
allow_from: list[str] = Field(default_factory=list)
transcription_provider: str = "openai"  # which Whisper backend to use: "openai" or "groq"
transcription_api_key: str = ""  # API key for the chosen backend; empty disables transcription
group_policy: Literal["open", "mention"] = "open"  # "open" responds to all, "mention" only when @mentioned
@ -51,6 +53,8 @@ class WhatsAppChannel(BaseChannel):
# Active bridge WebSocket connection (None until connected).
self._ws = None
self._connected = False
# NOTE(review): presumably an LRU-style dedupe cache of handled message
# IDs — confirm against the message handler.
self._processed_message_ids: OrderedDict[str, None] = OrderedDict()
# Exposed for BaseChannel.transcribe_audio, which reads both attributes.
self.transcription_api_key = config.transcription_api_key
self.transcription_provider = config.transcription_provider
async def login(self, force: bool = False) -> bool:
"""
@ -203,11 +207,16 @@ class WhatsAppChannel(BaseChannel):
# Handle voice transcription if it's a voice message
if content == "[Voice Message]":
# NOTE(review): `media_paths` is consumed in this branch, but the only
# assignment visible here (`media_paths = data.get("media") or []`)
# happens *after* it — confirm the variable is also set earlier in the
# method, otherwise this reads an undefined or stale value.
if media_paths:
logger.info("Transcribing voice message from {}...", sender_id)
transcription = await self.transcribe_audio(media_paths[0])
if transcription:
# Replace the placeholder with the transcribed text.
content = transcription
logger.info("Transcribed voice from {}: {}...", sender_id, transcription[:50])
else:
# Transcription returned "" — surface a visible placeholder instead.
content = "[Voice Message: Transcription failed]"
else:
# Bridge delivered no downloaded audio file for this message.
content = "[Voice Message: Audio not available]"
# Extract media paths (images/documents/videos downloaded by the bridge)
media_paths = data.get("media") or []

View File

@ -1,8 +1,36 @@
"""Voice transcription provider using Groq."""
"""Voice transcription providers (Groq and OpenAI Whisper)."""
import os
from pathlib import Path
class OpenAITranscriptionProvider:
"""Voice transcription provider using OpenAI's Whisper API."""
def __init__(self, api_key: str | None = None):
self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
self.api_url = "https://api.openai.com/v1/audio/transcriptions"
async def transcribe(self, file_path: str | Path) -> str:
if not self.api_key:
return ""
path = Path(file_path)
if not path.exists():
return ""
try:
import httpx
async with httpx.AsyncClient() as client:
with open(path, "rb") as f:
files = {"file": (path.name, f), "model": (None, "whisper-1")}
headers = {"Authorization": f"Bearer {self.api_key}"}
response = await client.post(
self.api_url, headers=headers, files=files, timeout=60.0,
)
response.raise_for_status()
return response.json().get("text", "")
except Exception:
return ""
import httpx
from loguru import logger