refactor: unify voice transcription config across all channels

- Move transcriptionProvider to global channels config (not per-channel)
- ChannelManager auto-resolves API key from matching provider config
- BaseChannel gets transcription_provider attribute, no more getattr hack
- Remove redundant transcription fields from WhatsAppConfig
- Update README: document transcriptionProvider, update provider table

Made-with: Cursor
This commit is contained in:
Xubin Ren 2026-04-06 06:07:30 +00:00
parent 7b7a3e5748
commit 35dde8a30e
6 changed files with 21 additions and 15 deletions

View File

@ -900,7 +900,7 @@ IMAP_PASSWORD=your-password-here
### Providers
> [!TIP]
> - **Groq** provides free voice transcription via Whisper. If configured, Telegram voice messages will be automatically transcribed.
> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper when the selected provider has an API key configured. Groq is used by default (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead — the API key is taken from the matching provider's config.
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
> - **VolcEngine / BytePlus Coding Plan**: Use dedicated providers `volcengineCodingPlan` or `byteplusCodingPlan` instead of the pay-per-use `volcengine` / `byteplus` providers.
@ -916,9 +916,9 @@ IMAP_PASSWORD=your-password-here
| `byteplus` | LLM (VolcEngine international, pay-per-use) | [Coding Plan](https://www.byteplus.com/en/activity/codingplan?utm_campaign=nanobot&utm_content=nanobot&utm_medium=devrel&utm_source=OWO&utm_term=nanobot) · [byteplus.com](https://www.byteplus.com) |
| `anthropic` | LLM (Claude direct) | [console.anthropic.com](https://console.anthropic.com) |
| `azure_openai` | LLM (Azure OpenAI) | [portal.azure.com](https://portal.azure.com) |
| `openai` | LLM (GPT direct) | [platform.openai.com](https://platform.openai.com) |
| `openai` | LLM + Voice transcription (Whisper) | [platform.openai.com](https://platform.openai.com) |
| `deepseek` | LLM (DeepSeek direct) | [platform.deepseek.com](https://platform.deepseek.com) |
| `groq` | LLM + **Voice transcription** (Whisper) | [console.groq.com](https://console.groq.com) |
| `groq` | LLM + Voice transcription (Whisper, default) | [console.groq.com](https://console.groq.com) |
| `minimax` | LLM (MiniMax direct) | [platform.minimaxi.com](https://platform.minimaxi.com) |
| `gemini` | LLM (Gemini direct) | [aistudio.google.com](https://aistudio.google.com) |
| `aihubmix` | LLM (API gateway, access to all models) | [aihubmix.com](https://aihubmix.com) |
@ -1233,6 +1233,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
"sendProgress": true,
"sendToolHints": false,
"sendMaxRetries": 3,
"transcriptionProvider": "groq",
"telegram": { ... }
}
}
@ -1243,6 +1244,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
| `sendProgress` | `true` | Stream agent's text progress to the channel |
| `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) |
| `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. The API key is taken automatically from the matching provider config; if that key is empty, voice messages are not transcribed. |
#### Retry Behavior

View File

@ -22,6 +22,7 @@ class BaseChannel(ABC):
name: str = "base"
display_name: str = "Base"
transcription_provider: str = "groq"
transcription_api_key: str = ""
def __init__(self, config: Any, bus: MessageBus):
@ -41,8 +42,7 @@ class BaseChannel(ABC):
if not self.transcription_api_key:
return ""
try:
provider_name = getattr(self, "transcription_provider", "groq")
if provider_name == "openai":
if self.transcription_provider == "openai":
from nanobot.providers.transcription import OpenAITranscriptionProvider
provider = OpenAITranscriptionProvider(api_key=self.transcription_api_key)
else:

View File

@ -39,7 +39,8 @@ class ChannelManager:
"""Initialize channels discovered via pkgutil scan + entry_points plugins."""
from nanobot.channels.registry import discover_all
groq_key = self.config.providers.groq.api_key
transcription_provider = self.config.channels.transcription_provider
transcription_key = self._resolve_transcription_key(transcription_provider)
for name, cls in discover_all().items():
section = getattr(self.config.channels, name, None)
@ -54,7 +55,8 @@ class ChannelManager:
continue
try:
channel = cls(section, self.bus)
channel.transcription_api_key = groq_key
channel.transcription_provider = transcription_provider
channel.transcription_api_key = transcription_key
self.channels[name] = channel
logger.info("{} channel enabled", cls.display_name)
except Exception as e:
@ -62,6 +64,12 @@ class ChannelManager:
self._validate_allow_from()
def _resolve_transcription_key(self, provider: str) -> str:
    """Return the API key that belongs to the transcription provider.

    ``"openai"`` selects the OpenAI provider's key; every other value
    selects the Groq provider's key. Only the chosen provider section
    is read from the configuration.
    """
    providers = self.config.providers
    # The conditional expression is lazy, so the unused section is never touched.
    selected = providers.openai if provider == "openai" else providers.groq
    return selected.api_key
def _validate_allow_from(self) -> None:
for name, ch in self.channels.items():
if getattr(ch.config, "allow_from", None) == []:

View File

@ -27,8 +27,6 @@ class WhatsAppConfig(Base):
bridge_url: str = "ws://localhost:3001"
bridge_token: str = ""
allow_from: list[str] = Field(default_factory=list)
transcription_provider: str = "openai" # openai or groq
transcription_api_key: str = ""
group_policy: Literal["open", "mention"] = "open" # "open" responds to all, "mention" only when @mentioned
@ -77,8 +75,6 @@ class WhatsAppChannel(BaseChannel):
self._ws = None
self._connected = False
self._processed_message_ids: OrderedDict[str, None] = OrderedDict()
self.transcription_api_key = config.transcription_api_key
self.transcription_provider = config.transcription_provider
self._bridge_token: str | None = None
def _effective_bridge_token(self) -> str:

View File

@ -28,6 +28,7 @@ class ChannelsConfig(Base):
send_progress: bool = True # stream agent's text progress to the channel
send_tool_hints: bool = False # stream tool-call hints (e.g. read_file("…"))
send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included)
transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai"
class DreamConfig(Base):

View File

@ -166,10 +166,9 @@ async def test_group_policy_mention_accepts_mentioned_group_message():
@pytest.mark.asyncio
async def test_voice_message_transcription_uses_media_path():
"""Voice messages are transcribed when media path is available."""
ch = WhatsAppChannel(
{"enabled": True, "transcriptionProvider": "openai", "transcriptionApiKey": "sk-test"},
MagicMock(),
)
ch = WhatsAppChannel({"enabled": True}, MagicMock())
ch.transcription_provider = "openai"
ch.transcription_api_key = "sk-test"
ch._handle_message = AsyncMock()
ch.transcribe_audio = AsyncMock(return_value="Hello world")