diff --git a/README.md b/README.md index 1858e1672..e42a6efe9 100644 --- a/README.md +++ b/README.md @@ -900,7 +900,7 @@ IMAP_PASSWORD=your-password-here ### Providers > [!TIP] -> - **Groq** provides free voice transcription via Whisper. If configured, Telegram voice messages will be automatically transcribed. +> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead — the API key is picked from the matching provider config. > - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link) > - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config. > - **VolcEngine / BytePlus Coding Plan**: Use dedicated providers `volcengineCodingPlan` or `byteplusCodingPlan` instead of the pay-per-use `volcengine` / `byteplus` providers. @@ -916,9 +916,9 @@ IMAP_PASSWORD=your-password-here | `byteplus` | LLM (VolcEngine international, pay-per-use) | [Coding Plan](https://www.byteplus.com/en/activity/codingplan?utm_campaign=nanobot&utm_content=nanobot&utm_medium=devrel&utm_source=OWO&utm_term=nanobot) · [byteplus.com](https://www.byteplus.com) | | `anthropic` | LLM (Claude direct) | [console.anthropic.com](https://console.anthropic.com) | | `azure_openai` | LLM (Azure OpenAI) | [portal.azure.com](https://portal.azure.com) | -| `openai` | LLM (GPT direct) | [platform.openai.com](https://platform.openai.com) | +| `openai` | LLM + Voice transcription (Whisper) | [platform.openai.com](https://platform.openai.com) | | `deepseek` | LLM (DeepSeek direct) | [platform.deepseek.com](https://platform.deepseek.com) | -| `groq` | LLM + **Voice transcription** (Whisper) | [console.groq.com](https://console.groq.com) | +| `groq` | LLM + Voice transcription (Whisper, default) | [console.groq.com](https://console.groq.com) | | `minimax` | LLM (MiniMax direct) | [platform.minimaxi.com](https://platform.minimaxi.com) | | `gemini` | LLM (Gemini direct) | [aistudio.google.com](https://aistudio.google.com) | | `aihubmix` | LLM (API gateway, access to all models) | [aihubmix.com](https://aihubmix.com) | @@ -1233,6 +1233,7 @@ Global settings that apply to all channels. Configure under the `channels` secti "sendProgress": true, "sendToolHints": false, "sendMaxRetries": 3, + "transcriptionProvider": "groq", "telegram": { ... } } } @@ -1243,6 +1244,7 @@ Global settings that apply to all channels. Configure under the `channels` secti | `sendProgress` | `true` | Stream agent's text progress to the channel | | `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) | | `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) | +| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key is auto-resolved from the matching provider config. | #### Retry Behavior diff --git a/nanobot/channels/base.py b/nanobot/channels/base.py index 86e991344..dd29c0851 100644 --- a/nanobot/channels/base.py +++ b/nanobot/channels/base.py @@ -22,6 +22,7 @@ class BaseChannel(ABC): name: str = "base" display_name: str = "Base" + transcription_provider: str = "groq" transcription_api_key: str = "" def __init__(self, config: Any, bus: MessageBus): @@ -37,13 +38,16 @@ class BaseChannel(ABC): self._running = False async def transcribe_audio(self, file_path: str | Path) -> str: - """Transcribe an audio file via Groq Whisper. Returns empty string on failure.""" + """Transcribe an audio file via Whisper (OpenAI or Groq). Returns empty string on failure.""" if not self.transcription_api_key: return "" try: - from nanobot.providers.transcription import GroqTranscriptionProvider - - provider = GroqTranscriptionProvider(api_key=self.transcription_api_key) + if self.transcription_provider == "openai": + from nanobot.providers.transcription import OpenAITranscriptionProvider + provider = OpenAITranscriptionProvider(api_key=self.transcription_api_key) + else: + from nanobot.providers.transcription import GroqTranscriptionProvider + provider = GroqTranscriptionProvider(api_key=self.transcription_api_key) return await provider.transcribe(file_path) except Exception as e: logger.warning("{}: audio transcription failed: {}", self.name, e) diff --git a/nanobot/channels/manager.py b/nanobot/channels/manager.py index 1f26f4d7a..aaec5e335 100644 --- a/nanobot/channels/manager.py +++ b/nanobot/channels/manager.py @@ -39,7 +39,8 @@ class ChannelManager: """Initialize channels discovered via pkgutil scan + entry_points plugins.""" from nanobot.channels.registry import discover_all - groq_key = self.config.providers.groq.api_key + transcription_provider = self.config.channels.transcription_provider + transcription_key = self._resolve_transcription_key(transcription_provider) for name, cls in discover_all().items(): section = getattr(self.config.channels, name, None) @@ -54,7 +55,8 @@ class ChannelManager: continue try: channel = cls(section, self.bus) - channel.transcription_api_key = groq_key + channel.transcription_provider = transcription_provider + channel.transcription_api_key = transcription_key self.channels[name] = channel logger.info("{} channel enabled", cls.display_name) except Exception as e: @@ -62,6 +64,15 @@ class ChannelManager: self._validate_allow_from() + def _resolve_transcription_key(self, provider: str) -> str: + """Pick the API key for the configured transcription provider.""" + try: + if provider == "openai": + return self.config.providers.openai.api_key + return self.config.providers.groq.api_key + except AttributeError: + return "" + def _validate_allow_from(self) -> None: for name, ch in self.channels.items(): if getattr(ch.config, "allow_from", None) == []: diff --git a/nanobot/channels/whatsapp.py b/nanobot/channels/whatsapp.py index a788dd727..1b46d6e97 100644 --- a/nanobot/channels/whatsapp.py +++ b/nanobot/channels/whatsapp.py @@ -232,17 +232,22 @@ class WhatsAppChannel(BaseChannel): sender_id = user_id.split("@")[0] if "@" in user_id else user_id logger.info("Sender {}", sender) - # Handle voice transcription if it's a voice message - if content == "[Voice Message]": - logger.info( - "Voice message received from {}, but direct download from bridge is not yet supported.", - sender_id, - ) - content = "[Voice Message: Transcription not available for WhatsApp yet]" - # Extract media paths (images/documents/videos downloaded by the bridge) media_paths = data.get("media") or [] + # Handle voice transcription if it's a voice message + if content == "[Voice Message]": + if media_paths: + logger.info("Transcribing voice message from {}...", sender_id) + transcription = await self.transcribe_audio(media_paths[0]) + if transcription: + content = transcription + logger.info("Transcribed voice from {}: {}...", sender_id, transcription[:50]) + else: + content = "[Voice Message: Transcription failed]" + else: + content = "[Voice Message: Audio not available]" + # Build content tags matching Telegram's pattern: [image: /path] or [file: /path] if media_paths: for p in media_paths: diff --git a/nanobot/config/schema.py b/nanobot/config/schema.py index dfb91c528..f147434e7 100644 --- a/nanobot/config/schema.py +++ b/nanobot/config/schema.py @@ -28,6 +28,7 @@ class ChannelsConfig(Base): send_progress: bool = True # stream agent's text progress to the channel send_tool_hints: bool = False # stream tool-call hints (e.g. read_file("…")) send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included) + transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai" class DreamConfig(Base): diff --git a/nanobot/providers/transcription.py b/nanobot/providers/transcription.py index 1c8cb6a3f..aca9693ee 100644 --- a/nanobot/providers/transcription.py +++ b/nanobot/providers/transcription.py @@ -1,4 +1,4 @@ -"""Voice transcription provider using Groq.""" +"""Voice transcription providers (Groq and OpenAI Whisper).""" import os from pathlib import Path @@ -7,6 +7,36 @@ import httpx from loguru import logger +class OpenAITranscriptionProvider: + """Voice transcription provider using OpenAI's Whisper API.""" + + def __init__(self, api_key: str | None = None): + self.api_key = api_key or os.environ.get("OPENAI_API_KEY") + self.api_url = "https://api.openai.com/v1/audio/transcriptions" + + async def transcribe(self, file_path: str | Path) -> str: + if not self.api_key: + logger.warning("OpenAI API key not configured for transcription") + return "" + path = Path(file_path) + if not path.exists(): + logger.error("Audio file not found: {}", file_path) + return "" + try: + async with httpx.AsyncClient() as client: + with open(path, "rb") as f: + files = {"file": (path.name, f), "model": (None, "whisper-1")} + headers = {"Authorization": f"Bearer {self.api_key}"} + response = await client.post( + self.api_url, headers=headers, files=files, timeout=60.0, + ) + response.raise_for_status() + return response.json().get("text", "") + except Exception as e: + logger.error("OpenAI transcription error: {}", e) + return "" + + class GroqTranscriptionProvider: """ Voice transcription provider using Groq's Whisper API. diff --git a/tests/channels/test_whatsapp_channel.py b/tests/channels/test_whatsapp_channel.py index 8223fdff3..f285e4dbe 100644 --- a/tests/channels/test_whatsapp_channel.py +++ b/tests/channels/test_whatsapp_channel.py @@ -163,6 +163,53 @@ async def test_group_policy_mention_accepts_mentioned_group_message(): assert kwargs["sender_id"] == "user" +@pytest.mark.asyncio +async def test_voice_message_transcription_uses_media_path(): + """Voice messages are transcribed when media path is available.""" + ch = WhatsAppChannel({"enabled": True}, MagicMock()) + ch.transcription_provider = "openai" + ch.transcription_api_key = "sk-test" + ch._handle_message = AsyncMock() + ch.transcribe_audio = AsyncMock(return_value="Hello world") + + await ch._handle_bridge_message( + json.dumps({ + "type": "message", + "id": "v1", + "sender": "12345@s.whatsapp.net", + "pn": "", + "content": "[Voice Message]", + "timestamp": 1, + "media": ["/tmp/voice.ogg"], + }) + ) + + ch.transcribe_audio.assert_awaited_once_with("/tmp/voice.ogg") + kwargs = ch._handle_message.await_args.kwargs + assert kwargs["content"].startswith("Hello world") + + +@pytest.mark.asyncio +async def test_voice_message_no_media_shows_not_available(): + """Voice messages without media produce a fallback placeholder.""" + ch = WhatsAppChannel({"enabled": True}, MagicMock()) + ch._handle_message = AsyncMock() + + await ch._handle_bridge_message( + json.dumps({ + "type": "message", + "id": "v2", + "sender": "12345@s.whatsapp.net", + "pn": "", + "content": "[Voice Message]", + "timestamp": 1, + }) + ) + + kwargs = ch._handle_message.await_args.kwargs + assert kwargs["content"] == "[Voice Message: Audio not available]" + + def test_load_or_create_bridge_token_persists_generated_secret(tmp_path): token_path = tmp_path / "whatsapp-auth" / "bridge-token"