mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-04-28 05:45:51 +00:00
Merge PR #2530: feat: unify voice message transcription via OpenAI/Groq Whisper
feat: unify voice message transcription via OpenAI/Groq Whisper
This commit is contained in:
commit
4c6a4321e0
@ -900,7 +900,7 @@ IMAP_PASSWORD=your-password-here
|
|||||||
### Providers
|
### Providers
|
||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
> - **Groq** provides free voice transcription via Whisper. If configured, Telegram voice messages will be automatically transcribed.
|
> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead. The API key is taken from the matching provider config; if that provider has no API key configured, voice messages are not transcribed.
|
||||||
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
|
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
|
||||||
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
|
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
|
||||||
> - **VolcEngine / BytePlus Coding Plan**: Use dedicated providers `volcengineCodingPlan` or `byteplusCodingPlan` instead of the pay-per-use `volcengine` / `byteplus` providers.
|
> - **VolcEngine / BytePlus Coding Plan**: Use dedicated providers `volcengineCodingPlan` or `byteplusCodingPlan` instead of the pay-per-use `volcengine` / `byteplus` providers.
|
||||||
@ -916,9 +916,9 @@ IMAP_PASSWORD=your-password-here
|
|||||||
| `byteplus` | LLM (VolcEngine international, pay-per-use) | [Coding Plan](https://www.byteplus.com/en/activity/codingplan?utm_campaign=nanobot&utm_content=nanobot&utm_medium=devrel&utm_source=OWO&utm_term=nanobot) · [byteplus.com](https://www.byteplus.com) |
|
| `byteplus` | LLM (VolcEngine international, pay-per-use) | [Coding Plan](https://www.byteplus.com/en/activity/codingplan?utm_campaign=nanobot&utm_content=nanobot&utm_medium=devrel&utm_source=OWO&utm_term=nanobot) · [byteplus.com](https://www.byteplus.com) |
|
||||||
| `anthropic` | LLM (Claude direct) | [console.anthropic.com](https://console.anthropic.com) |
|
| `anthropic` | LLM (Claude direct) | [console.anthropic.com](https://console.anthropic.com) |
|
||||||
| `azure_openai` | LLM (Azure OpenAI) | [portal.azure.com](https://portal.azure.com) |
|
| `azure_openai` | LLM (Azure OpenAI) | [portal.azure.com](https://portal.azure.com) |
|
||||||
| `openai` | LLM (GPT direct) | [platform.openai.com](https://platform.openai.com) |
|
| `openai` | LLM + Voice transcription (Whisper) | [platform.openai.com](https://platform.openai.com) |
|
||||||
| `deepseek` | LLM (DeepSeek direct) | [platform.deepseek.com](https://platform.deepseek.com) |
|
| `deepseek` | LLM (DeepSeek direct) | [platform.deepseek.com](https://platform.deepseek.com) |
|
||||||
| `groq` | LLM + **Voice transcription** (Whisper) | [console.groq.com](https://console.groq.com) |
|
| `groq` | LLM + Voice transcription (Whisper, default) | [console.groq.com](https://console.groq.com) |
|
||||||
| `minimax` | LLM (MiniMax direct) | [platform.minimaxi.com](https://platform.minimaxi.com) |
|
| `minimax` | LLM (MiniMax direct) | [platform.minimaxi.com](https://platform.minimaxi.com) |
|
||||||
| `gemini` | LLM (Gemini direct) | [aistudio.google.com](https://aistudio.google.com) |
|
| `gemini` | LLM (Gemini direct) | [aistudio.google.com](https://aistudio.google.com) |
|
||||||
| `aihubmix` | LLM (API gateway, access to all models) | [aihubmix.com](https://aihubmix.com) |
|
| `aihubmix` | LLM (API gateway, access to all models) | [aihubmix.com](https://aihubmix.com) |
|
||||||
@ -1233,6 +1233,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
|
|||||||
"sendProgress": true,
|
"sendProgress": true,
|
||||||
"sendToolHints": false,
|
"sendToolHints": false,
|
||||||
"sendMaxRetries": 3,
|
"sendMaxRetries": 3,
|
||||||
|
"transcriptionProvider": "groq",
|
||||||
"telegram": { ... }
|
"telegram": { ... }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1243,6 +1244,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
|
|||||||
| `sendProgress` | `true` | Stream agent's text progress to the channel |
|
| `sendProgress` | `true` | Stream agent's text progress to the channel |
|
||||||
| `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) |
|
| `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) |
|
||||||
| `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
|
| `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
|
||||||
|
| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key is auto-resolved from the matching provider config. |
|
||||||
|
|
||||||
#### Retry Behavior
|
#### Retry Behavior
|
||||||
|
|
||||||
|
|||||||
@ -22,6 +22,7 @@ class BaseChannel(ABC):
|
|||||||
|
|
||||||
name: str = "base"
|
name: str = "base"
|
||||||
display_name: str = "Base"
|
display_name: str = "Base"
|
||||||
|
transcription_provider: str = "groq"
|
||||||
transcription_api_key: str = ""
|
transcription_api_key: str = ""
|
||||||
|
|
||||||
def __init__(self, config: Any, bus: MessageBus):
|
def __init__(self, config: Any, bus: MessageBus):
|
||||||
@ -37,13 +38,16 @@ class BaseChannel(ABC):
|
|||||||
self._running = False
|
self._running = False
|
||||||
|
|
||||||
async def transcribe_audio(self, file_path: str | Path) -> str:
|
async def transcribe_audio(self, file_path: str | Path) -> str:
|
||||||
"""Transcribe an audio file via Groq Whisper. Returns empty string on failure."""
|
"""Transcribe an audio file via Whisper (OpenAI or Groq). Returns empty string on failure."""
|
||||||
if not self.transcription_api_key:
|
if not self.transcription_api_key:
|
||||||
return ""
|
return ""
|
||||||
try:
|
try:
|
||||||
from nanobot.providers.transcription import GroqTranscriptionProvider
|
if self.transcription_provider == "openai":
|
||||||
|
from nanobot.providers.transcription import OpenAITranscriptionProvider
|
||||||
provider = GroqTranscriptionProvider(api_key=self.transcription_api_key)
|
provider = OpenAITranscriptionProvider(api_key=self.transcription_api_key)
|
||||||
|
else:
|
||||||
|
from nanobot.providers.transcription import GroqTranscriptionProvider
|
||||||
|
provider = GroqTranscriptionProvider(api_key=self.transcription_api_key)
|
||||||
return await provider.transcribe(file_path)
|
return await provider.transcribe(file_path)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("{}: audio transcription failed: {}", self.name, e)
|
logger.warning("{}: audio transcription failed: {}", self.name, e)
|
||||||
|
|||||||
@ -39,7 +39,8 @@ class ChannelManager:
|
|||||||
"""Initialize channels discovered via pkgutil scan + entry_points plugins."""
|
"""Initialize channels discovered via pkgutil scan + entry_points plugins."""
|
||||||
from nanobot.channels.registry import discover_all
|
from nanobot.channels.registry import discover_all
|
||||||
|
|
||||||
groq_key = self.config.providers.groq.api_key
|
transcription_provider = self.config.channels.transcription_provider
|
||||||
|
transcription_key = self._resolve_transcription_key(transcription_provider)
|
||||||
|
|
||||||
for name, cls in discover_all().items():
|
for name, cls in discover_all().items():
|
||||||
section = getattr(self.config.channels, name, None)
|
section = getattr(self.config.channels, name, None)
|
||||||
@ -54,7 +55,8 @@ class ChannelManager:
|
|||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
channel = cls(section, self.bus)
|
channel = cls(section, self.bus)
|
||||||
channel.transcription_api_key = groq_key
|
channel.transcription_provider = transcription_provider
|
||||||
|
channel.transcription_api_key = transcription_key
|
||||||
self.channels[name] = channel
|
self.channels[name] = channel
|
||||||
logger.info("{} channel enabled", cls.display_name)
|
logger.info("{} channel enabled", cls.display_name)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -62,6 +64,15 @@ class ChannelManager:
|
|||||||
|
|
||||||
self._validate_allow_from()
|
self._validate_allow_from()
|
||||||
|
|
||||||
|
def _resolve_transcription_key(self, provider: str) -> str:
|
||||||
|
"""Pick the API key for the configured transcription provider."""
|
||||||
|
try:
|
||||||
|
if provider == "openai":
|
||||||
|
return self.config.providers.openai.api_key
|
||||||
|
return self.config.providers.groq.api_key
|
||||||
|
except AttributeError:
|
||||||
|
return ""
|
||||||
|
|
||||||
def _validate_allow_from(self) -> None:
|
def _validate_allow_from(self) -> None:
|
||||||
for name, ch in self.channels.items():
|
for name, ch in self.channels.items():
|
||||||
if getattr(ch.config, "allow_from", None) == []:
|
if getattr(ch.config, "allow_from", None) == []:
|
||||||
|
|||||||
@ -232,17 +232,22 @@ class WhatsAppChannel(BaseChannel):
|
|||||||
sender_id = user_id.split("@")[0] if "@" in user_id else user_id
|
sender_id = user_id.split("@")[0] if "@" in user_id else user_id
|
||||||
logger.info("Sender {}", sender)
|
logger.info("Sender {}", sender)
|
||||||
|
|
||||||
# Handle voice transcription if it's a voice message
|
|
||||||
if content == "[Voice Message]":
|
|
||||||
logger.info(
|
|
||||||
"Voice message received from {}, but direct download from bridge is not yet supported.",
|
|
||||||
sender_id,
|
|
||||||
)
|
|
||||||
content = "[Voice Message: Transcription not available for WhatsApp yet]"
|
|
||||||
|
|
||||||
# Extract media paths (images/documents/videos downloaded by the bridge)
|
# Extract media paths (images/documents/videos downloaded by the bridge)
|
||||||
media_paths = data.get("media") or []
|
media_paths = data.get("media") or []
|
||||||
|
|
||||||
|
# Handle voice transcription if it's a voice message
|
||||||
|
if content == "[Voice Message]":
|
||||||
|
if media_paths:
|
||||||
|
logger.info("Transcribing voice message from {}...", sender_id)
|
||||||
|
transcription = await self.transcribe_audio(media_paths[0])
|
||||||
|
if transcription:
|
||||||
|
content = transcription
|
||||||
|
logger.info("Transcribed voice from {}: {}...", sender_id, transcription[:50])
|
||||||
|
else:
|
||||||
|
content = "[Voice Message: Transcription failed]"
|
||||||
|
else:
|
||||||
|
content = "[Voice Message: Audio not available]"
|
||||||
|
|
||||||
# Build content tags matching Telegram's pattern: [image: /path] or [file: /path]
|
# Build content tags matching Telegram's pattern: [image: /path] or [file: /path]
|
||||||
if media_paths:
|
if media_paths:
|
||||||
for p in media_paths:
|
for p in media_paths:
|
||||||
|
|||||||
@ -28,6 +28,7 @@ class ChannelsConfig(Base):
|
|||||||
send_progress: bool = True # stream agent's text progress to the channel
|
send_progress: bool = True # stream agent's text progress to the channel
|
||||||
send_tool_hints: bool = False # stream tool-call hints (e.g. read_file("…"))
|
send_tool_hints: bool = False # stream tool-call hints (e.g. read_file("…"))
|
||||||
send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included)
|
send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included)
|
||||||
|
transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai"
|
||||||
|
|
||||||
|
|
||||||
class DreamConfig(Base):
|
class DreamConfig(Base):
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
"""Voice transcription provider using Groq."""
|
"""Voice transcription providers (Groq and OpenAI Whisper)."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -7,6 +7,36 @@ import httpx
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAITranscriptionProvider:
|
||||||
|
"""Voice transcription provider using OpenAI's Whisper API."""
|
||||||
|
|
||||||
|
def __init__(self, api_key: str | None = None):
|
||||||
|
self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
|
||||||
|
self.api_url = "https://api.openai.com/v1/audio/transcriptions"
|
||||||
|
|
||||||
|
async def transcribe(self, file_path: str | Path) -> str:
|
||||||
|
if not self.api_key:
|
||||||
|
logger.warning("OpenAI API key not configured for transcription")
|
||||||
|
return ""
|
||||||
|
path = Path(file_path)
|
||||||
|
if not path.exists():
|
||||||
|
logger.error("Audio file not found: {}", file_path)
|
||||||
|
return ""
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
files = {"file": (path.name, f), "model": (None, "whisper-1")}
|
||||||
|
headers = {"Authorization": f"Bearer {self.api_key}"}
|
||||||
|
response = await client.post(
|
||||||
|
self.api_url, headers=headers, files=files, timeout=60.0,
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json().get("text", "")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("OpenAI transcription error: {}", e)
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
class GroqTranscriptionProvider:
|
class GroqTranscriptionProvider:
|
||||||
"""
|
"""
|
||||||
Voice transcription provider using Groq's Whisper API.
|
Voice transcription provider using Groq's Whisper API.
|
||||||
|
|||||||
@ -163,6 +163,53 @@ async def test_group_policy_mention_accepts_mentioned_group_message():
|
|||||||
assert kwargs["sender_id"] == "user"
|
assert kwargs["sender_id"] == "user"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_voice_message_transcription_uses_media_path():
    """A voice message with a media path is transcribed and forwarded as text."""
    channel = WhatsAppChannel({"enabled": True}, MagicMock())
    channel.transcription_provider = "openai"
    channel.transcription_api_key = "sk-test"
    channel._handle_message = AsyncMock()
    # Stub the transcription call so no provider is contacted.
    channel.transcribe_audio = AsyncMock(return_value="Hello world")

    bridge_event = {
        "type": "message",
        "id": "v1",
        "sender": "12345@s.whatsapp.net",
        "pn": "",
        "content": "[Voice Message]",
        "timestamp": 1,
        "media": ["/tmp/voice.ogg"],
    }
    await channel._handle_bridge_message(json.dumps(bridge_event))

    # The first media path is what gets handed to the transcriber.
    channel.transcribe_audio.assert_awaited_once_with("/tmp/voice.ogg")
    forwarded = channel._handle_message.await_args.kwargs
    assert forwarded["content"].startswith("Hello world")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_voice_message_no_media_shows_not_available():
    """A voice message without any media path yields the fallback placeholder."""
    channel = WhatsAppChannel({"enabled": True}, MagicMock())
    channel._handle_message = AsyncMock()

    # No "media" key: the bridge delivered the event without an audio file.
    bridge_event = {
        "type": "message",
        "id": "v2",
        "sender": "12345@s.whatsapp.net",
        "pn": "",
        "content": "[Voice Message]",
        "timestamp": 1,
    }
    await channel._handle_bridge_message(json.dumps(bridge_event))

    forwarded = channel._handle_message.await_args.kwargs
    assert forwarded["content"] == "[Voice Message: Audio not available]"
|
||||||
|
|
||||||
|
|
||||||
def test_load_or_create_bridge_token_persists_generated_secret(tmp_path):
|
def test_load_or_create_bridge_token_persists_generated_secret(tmp_path):
|
||||||
token_path = tmp_path / "whatsapp-auth" / "bridge-token"
|
token_path = tmp_path / "whatsapp-auth" / "bridge-token"
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user