fix: allow specifying transcription language

2026-05-22 01:22:48 +00:00 · 2026-04-22 07:56:35 +09:00 · 2026-04-22 07:56:35 +09:00 · bc3d734df5
commit bc3d734df5
parent 1835f94d8e
6 changed files with 104 additions and 12 deletions
--- a/docs/configuration.md
+++ b/docs/configuration.md
@ -45,7 +45,7 @@ IMAP_PASSWORD=your-password-here
 ## Providers
 > [!TIP]
-> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead — the API key is picked from the matching provider config.
+> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead. With either provider, you can optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) for more accurate transcription. The API key is picked from the matching provider config.
 > - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
 > - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
 > - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
@ -440,6 +440,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
    "sendToolHints": false,
    "sendMaxRetries": 3,
    "transcriptionProvider": "groq",
    "transcriptionLanguage": null,
    "telegram": { ... }
  }
 }
@ -451,6 +452,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
 | `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) |
 | `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
 | `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key is auto-resolved from the matching provider config. |
 | `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. |
 ### Retry Behavior
--- a/nanobot/channels/base.py
+++ b/nanobot/channels/base.py
@ -25,7 +25,7 @@ class BaseChannel(ABC):
    transcription_provider: str = "groq"
    transcription_api_key: str = ""
    transcription_api_base: str = ""
-    transcription_language: str = ""
+    transcription_language: str | None = None
    def __init__(self, config: Any, bus: MessageBus):
        """
@ -49,6 +49,7 @@ class BaseChannel(ABC):
                provider = OpenAITranscriptionProvider(
                    api_key=self.transcription_api_key,
                    api_base=self.transcription_api_base or None,
                    language=self.transcription_language or None,
                )
            else:
                from nanobot.providers.transcription import GroqTranscriptionProvider
--- a/nanobot/channels/manager.py
+++ b/nanobot/channels/manager.py
@ -63,6 +63,7 @@ class ChannelManager:
        transcription_provider = self.config.channels.transcription_provider
        transcription_key = self._resolve_transcription_key(transcription_provider)
        transcription_base = self._resolve_transcription_base(transcription_provider)
        transcription_language = self.config.channels.transcription_language
        for name, cls in discover_all().items():
            section = getattr(self.config.channels, name, None)
@ -88,7 +89,7 @@ class ChannelManager:
                channel.transcription_provider = transcription_provider
                channel.transcription_api_key = transcription_key
                channel.transcription_api_base = transcription_base
-                channel.transcription_language = getattr(self.config.channels, "transcription_language", "")
+                channel.transcription_language = transcription_language
                self.channels[name] = channel
                logger.info("{} channel enabled", cls.display_name)
            except Exception as e:
--- a/nanobot/config/schema.py
+++ b/nanobot/config/schema.py
@ -29,7 +29,7 @@ class ChannelsConfig(Base):
    send_tool_hints: bool = False  # stream tool-call hints (e.g. read_file("…"))
    send_max_retries: int = Field(default=3, ge=0, le=10)  # Max delivery attempts (initial send included)
    transcription_provider: str = "groq"  # Voice transcription backend: "groq" or "openai"
-    transcription_language: str = ""  # Language code for Whisper STT (e.g. "en", "ru", "zh")
+    transcription_language: str | None = None  # Optional ISO-639-1 hint for audio transcription
 class DreamConfig(Base):
--- a/nanobot/providers/transcription.py
+++ b/nanobot/providers/transcription.py
@ -10,13 +10,19 @@ from loguru import logger
 class OpenAITranscriptionProvider:
    """Voice transcription provider using OpenAI's Whisper API."""
-    def __init__(self, api_key: str | None = None, api_base: str | None = None):
+    def __init__(
        self,
        api_key: str | None = None,
        api_base: str | None = None,
        language: str | None = None,
    ):
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self.api_url = (
            api_base
            or os.environ.get("OPENAI_TRANSCRIPTION_BASE_URL")
            or "https://api.openai.com/v1/audio/transcriptions"
        )
        self.language = language
    async def transcribe(self, file_path: str | Path) -> str:
        if not self.api_key:
@ -30,6 +36,8 @@ class OpenAITranscriptionProvider:
            async with httpx.AsyncClient() as client:
                with open(path, "rb") as f:
                    files = {"file": (path.name, f), "model": (None, "whisper-1")}
                    if self.language:
                        files["language"] = (None, self.language)
                    headers = {"Authorization": f"Bearer {self.api_key}"}
                    response = await client.post(
                        self.api_url, headers=headers, files=files, timeout=60.0,
@ -48,7 +56,12 @@ class GroqTranscriptionProvider:
    Groq offers extremely fast transcription with a generous free tier.
    """
-    def __init__(self, api_key: str | None = None, api_base: str | None = None, language: str | None = None):
+    def __init__(
        self,
        api_key: str | None = None,
        api_base: str | None = None,
        language: str | None = None,
    ):
        self.api_key = api_key or os.environ.get("GROQ_API_KEY")
        self.api_url = api_base or os.environ.get("GROQ_BASE_URL") or "https://api.groq.com/openai/v1/audio/transcriptions"
        self.language = language
--- a/tests/channels/test_channel_plugins.py
+++ b/tests/channels/test_channel_plugins.py
@ -15,7 +15,6 @@ from nanobot.channels.manager import ChannelManager
 from nanobot.config.schema import ChannelsConfig
 from nanobot.utils.restart import RestartNotice
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@ -200,8 +199,8 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels():
    fake_config = SimpleNamespace(
        channels=ChannelsConfig.model_validate({
            "fakeplugin": {"enabled": True, "allowFrom": ["*"]},
            "transcriptionLanguage": "en",
        }),
        transcription_provider="groq",
        providers=SimpleNamespace(
            groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"),
            openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"),
@ -223,6 +222,7 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels():
    assert channel.transcription_provider == "groq"
    assert channel.transcription_api_key == "groq-key"
    assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
    assert channel.transcription_language == "en"
@pytest.mark.asyncio
@ -269,13 +269,15 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider():
    channel.transcription_provider = "openai"
    channel.transcription_api_key = "k"
    channel.transcription_api_base = "http://override/v1/audio/transcriptions"
    channel.transcription_language = "en"
    captured: dict[str, object] = {}
    class _StubOpenAI:
-        def __init__(self, api_key=None, api_base=None):
+        def __init__(self, api_key=None, api_base=None, language=None):
            captured["api_key"] = api_key
            captured["api_base"] = api_base
            captured["language"] = language
        async def transcribe(self, file_path):
            return "ok"
@ -286,6 +288,7 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider():
    assert result == "ok"
    assert captured["api_key"] == "k"
    assert captured["api_base"] == "http://override/v1/audio/transcriptions"
    assert captured["language"] == "en"
 def test_openai_transcription_provider_honors_api_base_argument():
@ -300,10 +303,80 @@ def test_openai_transcription_provider_honors_api_base_argument():
    assert custom.api_url == "http://override/v1/audio/transcriptions"
@pytest.mark.asyncio
 async def test_base_channel_passes_language_to_groq_transcription_provider():
    """BaseChannel.transcribe_audio must forward transcription_language to Groq."""
    from nanobot.providers import transcription as transcription_mod
    channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
    channel.transcription_provider = "groq"
    channel.transcription_api_key = "k"
    channel.transcription_api_base = "http://override/v1/audio/transcriptions"
    channel.transcription_language = "ko"
    captured: dict[str, object] = {}
    class _StubGroq:
        def __init__(self, api_key=None, api_base=None, language=None):
            captured["api_key"] = api_key
            captured["api_base"] = api_base
            captured["language"] = language
        async def transcribe(self, file_path):
            return "ok"
    with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq):
        result = await channel.transcribe_audio("/tmp/does-not-matter.wav")
    assert result == "ok"
    assert captured["api_key"] == "k"
    assert captured["api_base"] == "http://override/v1/audio/transcriptions"
    assert captured["language"] == "ko"
@pytest.mark.asyncio
 async def test_groq_transcription_provider_includes_language(tmp_path):
    from nanobot.providers.transcription import GroqTranscriptionProvider
    audio = tmp_path / "sample.wav"
    audio.write_bytes(b"audio")
    captured: dict[str, object] = {}
    class _Response:
        def raise_for_status(self):
            return None
        def json(self):
            return {"text": "hello"}
    class _AsyncClient:
        async def __aenter__(self):
            return self
        async def __aexit__(self, exc_type, exc, tb):
            return False
        async def post(self, url, headers=None, files=None, timeout=None):
            captured["url"] = url
            captured["headers"] = headers
            captured["files"] = files
            captured["timeout"] = timeout
            return _Response()
    provider = GroqTranscriptionProvider(api_key="k", language="ko")
    with patch("nanobot.providers.transcription.httpx.AsyncClient", return_value=_AsyncClient()):
        result = await provider.transcribe(audio)
    assert result == "hello"
    assert captured["files"]["language"] == (None, "ko")
 def test_channels_login_uses_discovered_plugin_class(monkeypatch):
    from typer.testing import CliRunner
    from nanobot.cli.commands import app
    from nanobot.config.schema import Config
    from typer.testing import CliRunner
    runner = CliRunner()
    seen: dict[str, object] = {}
@ -329,9 +402,10 @@ def test_channels_login_uses_discovered_plugin_class(monkeypatch):
 def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
    from typer.testing import CliRunner
    from nanobot.cli.commands import app
    from nanobot.config.schema import Config
    from typer.testing import CliRunner
    runner = CliRunner()
    seen: dict[str, object] = {}
@ -358,9 +432,10 @@ def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
 def test_channels_status_sets_custom_config_path(monkeypatch, tmp_path):
    from typer.testing import CliRunner
    from nanobot.cli.commands import app
    from nanobot.config.schema import Config
    from typer.testing import CliRunner
    runner = CliRunner()
    seen: dict[str, object] = {}