From bc3d734df5416a2303ad323e9dbd3fcc648704f8 Mon Sep 17 00:00:00 2001 From: k Date: Wed, 22 Apr 2026 07:56:35 +0900 Subject: [PATCH] fix: allow specifying transcription language --- docs/configuration.md | 4 +- nanobot/channels/base.py | 3 +- nanobot/channels/manager.py | 3 +- nanobot/config/schema.py | 2 +- nanobot/providers/transcription.py | 17 ++++- tests/channels/test_channel_plugins.py | 87 ++++++++++++++++++++++++-- 6 files changed, 104 insertions(+), 12 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 96b5fa5b7..dcce6aed6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -45,7 +45,7 @@ IMAP_PASSWORD=your-password-here ## Providers > [!TIP] -> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead — the API key is picked from the matching provider config. +> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead. With either provider, you can optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) for more accurate transcription. The API key is picked from the matching provider config. > - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link) > - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config. > - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode.
MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`. @@ -440,6 +440,7 @@ Global settings that apply to all channels. Configure under the `channels` secti "sendToolHints": false, "sendMaxRetries": 3, "transcriptionProvider": "groq", + "transcriptionLanguage": null, "telegram": { ... } } } @@ -451,6 +452,7 @@ Global settings that apply to all channels. Configure under the `channels` secti | `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) | | `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) | | `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key is auto-resolved from the matching provider config. | +| `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. 
| ### Retry Behavior diff --git a/nanobot/channels/base.py b/nanobot/channels/base.py index 778ed4d4c..62bcd45c1 100644 --- a/nanobot/channels/base.py +++ b/nanobot/channels/base.py @@ -25,7 +25,7 @@ class BaseChannel(ABC): transcription_provider: str = "groq" transcription_api_key: str = "" transcription_api_base: str = "" - transcription_language: str = "" + transcription_language: str | None = None def __init__(self, config: Any, bus: MessageBus): """ @@ -49,6 +49,7 @@ class BaseChannel(ABC): provider = OpenAITranscriptionProvider( api_key=self.transcription_api_key, api_base=self.transcription_api_base or None, + language=self.transcription_language or None, ) else: from nanobot.providers.transcription import GroqTranscriptionProvider diff --git a/nanobot/channels/manager.py b/nanobot/channels/manager.py index 0f0b7430d..7110311b5 100644 --- a/nanobot/channels/manager.py +++ b/nanobot/channels/manager.py @@ -63,6 +63,7 @@ class ChannelManager: transcription_provider = self.config.channels.transcription_provider transcription_key = self._resolve_transcription_key(transcription_provider) transcription_base = self._resolve_transcription_base(transcription_provider) + transcription_language = self.config.channels.transcription_language for name, cls in discover_all().items(): section = getattr(self.config.channels, name, None) @@ -88,7 +89,7 @@ class ChannelManager: channel.transcription_provider = transcription_provider channel.transcription_api_key = transcription_key channel.transcription_api_base = transcription_base - channel.transcription_language = getattr(self.config.channels, "transcription_language", "") + channel.transcription_language = transcription_language self.channels[name] = channel logger.info("{} channel enabled", cls.display_name) except Exception as e: diff --git a/nanobot/config/schema.py b/nanobot/config/schema.py index a6978ba3e..0c2b1b2ac 100644 --- a/nanobot/config/schema.py +++ b/nanobot/config/schema.py @@ -29,7 +29,7 @@ class 
ChannelsConfig(Base): send_tool_hints: bool = False # stream tool-call hints (e.g. read_file("…")) send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included) transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai" - transcription_language: str = "" # Language code for Whisper STT (e.g. "en", "ru", "zh") + transcription_language: str | None = None # Optional ISO-639-1 hint for audio transcription class DreamConfig(Base): diff --git a/nanobot/providers/transcription.py b/nanobot/providers/transcription.py index daeedfbef..969990166 100644 --- a/nanobot/providers/transcription.py +++ b/nanobot/providers/transcription.py @@ -10,13 +10,19 @@ from loguru import logger class OpenAITranscriptionProvider: """Voice transcription provider using OpenAI's Whisper API.""" - def __init__(self, api_key: str | None = None, api_base: str | None = None): + def __init__( + self, + api_key: str | None = None, + api_base: str | None = None, + language: str | None = None, + ): self.api_key = api_key or os.environ.get("OPENAI_API_KEY") self.api_url = ( api_base or os.environ.get("OPENAI_TRANSCRIPTION_BASE_URL") or "https://api.openai.com/v1/audio/transcriptions" ) + self.language = language async def transcribe(self, file_path: str | Path) -> str: if not self.api_key: @@ -30,6 +36,8 @@ class OpenAITranscriptionProvider: async with httpx.AsyncClient() as client: with open(path, "rb") as f: files = {"file": (path.name, f), "model": (None, "whisper-1")} + if self.language: + files["language"] = (None, self.language) headers = {"Authorization": f"Bearer {self.api_key}"} response = await client.post( self.api_url, headers=headers, files=files, timeout=60.0, @@ -48,7 +56,12 @@ class GroqTranscriptionProvider: Groq offers extremely fast transcription with a generous free tier. 
""" - def __init__(self, api_key: str | None = None, api_base: str | None = None, language: str | None = None): + def __init__( + self, + api_key: str | None = None, + api_base: str | None = None, + language: str | None = None, + ): self.api_key = api_key or os.environ.get("GROQ_API_KEY") self.api_url = api_base or os.environ.get("GROQ_BASE_URL") or "https://api.groq.com/openai/v1/audio/transcriptions" self.language = language diff --git a/tests/channels/test_channel_plugins.py b/tests/channels/test_channel_plugins.py index a6959f937..6abe21f7a 100644 --- a/tests/channels/test_channel_plugins.py +++ b/tests/channels/test_channel_plugins.py @@ -15,7 +15,6 @@ from nanobot.channels.manager import ChannelManager from nanobot.config.schema import ChannelsConfig from nanobot.utils.restart import RestartNotice - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -200,8 +199,8 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels(): fake_config = SimpleNamespace( channels=ChannelsConfig.model_validate({ "fakeplugin": {"enabled": True, "allowFrom": ["*"]}, + "transcriptionLanguage": "en", }), - transcription_provider="groq", providers=SimpleNamespace( groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"), openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"), @@ -223,6 +222,7 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels(): assert channel.transcription_provider == "groq" assert channel.transcription_api_key == "groq-key" assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions" + assert channel.transcription_language == "en" @pytest.mark.asyncio @@ -269,13 +269,15 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider(): channel.transcription_provider = "openai" 
channel.transcription_api_key = "k" channel.transcription_api_base = "http://override/v1/audio/transcriptions" + channel.transcription_language = "en" captured: dict[str, object] = {} class _StubOpenAI: - def __init__(self, api_key=None, api_base=None): + def __init__(self, api_key=None, api_base=None, language=None): captured["api_key"] = api_key captured["api_base"] = api_base + captured["language"] = language async def transcribe(self, file_path): return "ok" @@ -286,6 +288,7 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider(): assert result == "ok" assert captured["api_key"] == "k" assert captured["api_base"] == "http://override/v1/audio/transcriptions" + assert captured["language"] == "en" def test_openai_transcription_provider_honors_api_base_argument(): @@ -300,10 +303,80 @@ def test_openai_transcription_provider_honors_api_base_argument(): assert custom.api_url == "http://override/v1/audio/transcriptions" +@pytest.mark.asyncio +async def test_base_channel_passes_language_to_groq_transcription_provider(): + """BaseChannel.transcribe_audio must forward transcription_language to Groq.""" + from nanobot.providers import transcription as transcription_mod + + channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus()) + channel.transcription_provider = "groq" + channel.transcription_api_key = "k" + channel.transcription_api_base = "http://override/v1/audio/transcriptions" + channel.transcription_language = "ko" + + captured: dict[str, object] = {} + + class _StubGroq: + def __init__(self, api_key=None, api_base=None, language=None): + captured["api_key"] = api_key + captured["api_base"] = api_base + captured["language"] = language + + async def transcribe(self, file_path): + return "ok" + + with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq): + result = await channel.transcribe_audio("/tmp/does-not-matter.wav") + + assert result == "ok" + assert captured["api_key"] == "k" + assert 
captured["api_base"] == "http://override/v1/audio/transcriptions" + assert captured["language"] == "ko" + + +@pytest.mark.asyncio +async def test_groq_transcription_provider_includes_language(tmp_path): + from nanobot.providers.transcription import GroqTranscriptionProvider + + audio = tmp_path / "sample.wav" + audio.write_bytes(b"audio") + captured: dict[str, object] = {} + + class _Response: + def raise_for_status(self): + return None + + def json(self): + return {"text": "hello"} + + class _AsyncClient: + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + async def post(self, url, headers=None, files=None, timeout=None): + captured["url"] = url + captured["headers"] = headers + captured["files"] = files + captured["timeout"] = timeout + return _Response() + + provider = GroqTranscriptionProvider(api_key="k", language="ko") + + with patch("nanobot.providers.transcription.httpx.AsyncClient", return_value=_AsyncClient()): + result = await provider.transcribe(audio) + + assert result == "hello" + assert captured["files"]["language"] == (None, "ko") + + def test_channels_login_uses_discovered_plugin_class(monkeypatch): + from typer.testing import CliRunner + from nanobot.cli.commands import app from nanobot.config.schema import Config - from typer.testing import CliRunner runner = CliRunner() seen: dict[str, object] = {} @@ -329,9 +402,10 @@ def test_channels_login_uses_discovered_plugin_class(monkeypatch): def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path): + from typer.testing import CliRunner + from nanobot.cli.commands import app from nanobot.config.schema import Config - from typer.testing import CliRunner runner = CliRunner() seen: dict[str, object] = {} @@ -358,9 +432,10 @@ def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path): def test_channels_status_sets_custom_config_path(monkeypatch, tmp_path): + from typer.testing import CliRunner + from nanobot.cli.commands 
import app from nanobot.config.schema import Config - from typer.testing import CliRunner runner = CliRunner() seen: dict[str, object] = {}