From bc3d734df5416a2303ad323e9dbd3fcc648704f8 Mon Sep 17 00:00:00 2001
From: k <lahuman@daum.net>
Date: Wed, 22 Apr 2026 07:56:35 +0900
Subject: [PATCH] fix: allow specifying transcription language

---
 docs/configuration.md                  |  4 +-
 nanobot/channels/base.py               |  3 +-
 nanobot/channels/manager.py            |  3 +-
 nanobot/config/schema.py               |  2 +-
 nanobot/providers/transcription.py     | 17 ++++-
 tests/channels/test_channel_plugins.py | 87 ++++++++++++++++++++++++--
 6 files changed, 104 insertions(+), 12 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 96b5fa5b7..dcce6aed6 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -45,7 +45,7 @@ IMAP_PASSWORD=your-password-here
 ## Providers
 
 > [!TIP]
-> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead — the API key is picked from the matching provider config.
+> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead. With either provider, you can optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) under `channels` for more accurate transcription. The API key is picked from the matching provider config.
 > - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
 > - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
 > - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
@@ -440,6 +440,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
     "sendToolHints": false,
     "sendMaxRetries": 3,
     "transcriptionProvider": "groq",
+    "transcriptionLanguage": null,
     "telegram": { ... }
   }
 }
@@ -451,6 +452,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
 | `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) |
 | `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
 | `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key is auto-resolved from the matching provider config. |
+| `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. Used by both the `"groq"` and `"openai"` providers. |
 
 ### Retry Behavior
 
diff --git a/nanobot/channels/base.py b/nanobot/channels/base.py
index 778ed4d4c..62bcd45c1 100644
--- a/nanobot/channels/base.py
+++ b/nanobot/channels/base.py
@@ -25,7 +25,7 @@ class BaseChannel(ABC):
     transcription_provider: str = "groq"
     transcription_api_key: str = ""
     transcription_api_base: str = ""
-    transcription_language: str = ""
+    transcription_language: str | None = None
 
     def __init__(self, config: Any, bus: MessageBus):
         """
@@ -49,6 +49,7 @@ class BaseChannel(ABC):
                 provider = OpenAITranscriptionProvider(
                     api_key=self.transcription_api_key,
                     api_base=self.transcription_api_base or None,
+                    language=self.transcription_language or None,
                 )
             else:
                 from nanobot.providers.transcription import GroqTranscriptionProvider
diff --git a/nanobot/channels/manager.py b/nanobot/channels/manager.py
index 0f0b7430d..7110311b5 100644
--- a/nanobot/channels/manager.py
+++ b/nanobot/channels/manager.py
@@ -63,6 +63,7 @@ class ChannelManager:
         transcription_provider = self.config.channels.transcription_provider
         transcription_key = self._resolve_transcription_key(transcription_provider)
         transcription_base = self._resolve_transcription_base(transcription_provider)
+        transcription_language = self.config.channels.transcription_language
 
         for name, cls in discover_all().items():
             section = getattr(self.config.channels, name, None)
@@ -88,7 +89,7 @@ class ChannelManager:
                 channel.transcription_provider = transcription_provider
                 channel.transcription_api_key = transcription_key
                 channel.transcription_api_base = transcription_base
-                channel.transcription_language = getattr(self.config.channels, "transcription_language", "")
+                channel.transcription_language = transcription_language
                 self.channels[name] = channel
                 logger.info("{} channel enabled", cls.display_name)
             except Exception as e:
diff --git a/nanobot/config/schema.py b/nanobot/config/schema.py
index a6978ba3e..0c2b1b2ac 100644
--- a/nanobot/config/schema.py
+++ b/nanobot/config/schema.py
@@ -29,7 +29,7 @@ class ChannelsConfig(Base):
     send_tool_hints: bool = False  # stream tool-call hints (e.g. read_file("…"))
     send_max_retries: int = Field(default=3, ge=0, le=10)  # Max delivery attempts (initial send included)
     transcription_provider: str = "groq"  # Voice transcription backend: "groq" or "openai"
-    transcription_language: str = ""  # Language code for Whisper STT (e.g. "en", "ru", "zh")
+    transcription_language: str | None = None  # Optional ISO-639-1 hint for audio transcription
 
 
 class DreamConfig(Base):
diff --git a/nanobot/providers/transcription.py b/nanobot/providers/transcription.py
index daeedfbef..969990166 100644
--- a/nanobot/providers/transcription.py
+++ b/nanobot/providers/transcription.py
@@ -10,13 +10,19 @@ from loguru import logger
 class OpenAITranscriptionProvider:
     """Voice transcription provider using OpenAI's Whisper API."""
 
-    def __init__(self, api_key: str | None = None, api_base: str | None = None):
+    def __init__(
+        self,
+        api_key: str | None = None,
+        api_base: str | None = None,
+        language: str | None = None,
+    ):
         self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
         self.api_url = (
             api_base
             or os.environ.get("OPENAI_TRANSCRIPTION_BASE_URL")
             or "https://api.openai.com/v1/audio/transcriptions"
         )
+        self.language = language
 
     async def transcribe(self, file_path: str | Path) -> str:
         if not self.api_key:
@@ -30,6 +36,8 @@ class OpenAITranscriptionProvider:
             async with httpx.AsyncClient() as client:
                 with open(path, "rb") as f:
                     files = {"file": (path.name, f), "model": (None, "whisper-1")}
+                    if self.language:
+                        files["language"] = (None, self.language)
                     headers = {"Authorization": f"Bearer {self.api_key}"}
                     response = await client.post(
                         self.api_url, headers=headers, files=files, timeout=60.0,
@@ -48,7 +56,12 @@ class GroqTranscriptionProvider:
     Groq offers extremely fast transcription with a generous free tier.
     """
 
-    def __init__(self, api_key: str | None = None, api_base: str | None = None, language: str | None = None):
+    def __init__(
+        self,
+        api_key: str | None = None,
+        api_base: str | None = None,
+        language: str | None = None,
+    ):
         self.api_key = api_key or os.environ.get("GROQ_API_KEY")
         self.api_url = api_base or os.environ.get("GROQ_BASE_URL") or "https://api.groq.com/openai/v1/audio/transcriptions"
         self.language = language
diff --git a/tests/channels/test_channel_plugins.py b/tests/channels/test_channel_plugins.py
index a6959f937..6abe21f7a 100644
--- a/tests/channels/test_channel_plugins.py
+++ b/tests/channels/test_channel_plugins.py
@@ -15,7 +15,6 @@ from nanobot.channels.manager import ChannelManager
 from nanobot.config.schema import ChannelsConfig
 from nanobot.utils.restart import RestartNotice
 
-
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -200,8 +199,8 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels():
     fake_config = SimpleNamespace(
         channels=ChannelsConfig.model_validate({
             "fakeplugin": {"enabled": True, "allowFrom": ["*"]},
+            "transcriptionLanguage": "en",
         }),
-        transcription_provider="groq",
         providers=SimpleNamespace(
             groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"),
             openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"),
@@ -223,6 +222,7 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels():
     assert channel.transcription_provider == "groq"
     assert channel.transcription_api_key == "groq-key"
     assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
+    assert channel.transcription_language == "en"
 
 
 @pytest.mark.asyncio
@@ -269,13 +269,15 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider():
     channel.transcription_provider = "openai"
     channel.transcription_api_key = "k"
     channel.transcription_api_base = "http://override/v1/audio/transcriptions"
+    channel.transcription_language = "en"
 
     captured: dict[str, object] = {}
 
     class _StubOpenAI:
-        def __init__(self, api_key=None, api_base=None):
+        def __init__(self, api_key=None, api_base=None, language=None):
             captured["api_key"] = api_key
             captured["api_base"] = api_base
+            captured["language"] = language
 
         async def transcribe(self, file_path):
             return "ok"
@@ -286,6 +288,7 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider():
     assert result == "ok"
     assert captured["api_key"] == "k"
     assert captured["api_base"] == "http://override/v1/audio/transcriptions"
+    assert captured["language"] == "en"
 
 
 def test_openai_transcription_provider_honors_api_base_argument():
@@ -300,10 +303,80 @@ def test_openai_transcription_provider_honors_api_base_argument():
     assert custom.api_url == "http://override/v1/audio/transcriptions"
 
 
+@pytest.mark.asyncio
+async def test_base_channel_passes_language_to_groq_transcription_provider():
+    """BaseChannel.transcribe_audio must forward transcription_language to Groq."""
+    from nanobot.providers import transcription as transcription_mod
+
+    channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
+    channel.transcription_provider = "groq"
+    channel.transcription_api_key = "k"
+    channel.transcription_api_base = "http://override/v1/audio/transcriptions"
+    channel.transcription_language = "ko"
+
+    captured: dict[str, object] = {}
+
+    class _StubGroq:
+        def __init__(self, api_key=None, api_base=None, language=None):
+            captured["api_key"] = api_key
+            captured["api_base"] = api_base
+            captured["language"] = language
+
+        async def transcribe(self, file_path):
+            return "ok"
+
+    with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq):
+        result = await channel.transcribe_audio("/tmp/does-not-matter.wav")
+
+    assert result == "ok"
+    assert captured["api_key"] == "k"
+    assert captured["api_base"] == "http://override/v1/audio/transcriptions"
+    assert captured["language"] == "ko"
+
+
+@pytest.mark.asyncio
+async def test_groq_transcription_provider_includes_language(tmp_path):
+    from nanobot.providers.transcription import GroqTranscriptionProvider
+
+    audio = tmp_path / "sample.wav"
+    audio.write_bytes(b"audio")
+    captured: dict[str, object] = {}
+
+    class _Response:
+        def raise_for_status(self):
+            return None
+
+        def json(self):
+            return {"text": "hello"}
+
+    class _AsyncClient:
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, exc_type, exc, tb):
+            return False
+
+        async def post(self, url, headers=None, files=None, timeout=None):
+            captured["url"] = url
+            captured["headers"] = headers
+            captured["files"] = files
+            captured["timeout"] = timeout
+            return _Response()
+
+    provider = GroqTranscriptionProvider(api_key="k", language="ko")
+
+    with patch("nanobot.providers.transcription.httpx.AsyncClient", return_value=_AsyncClient()):
+        result = await provider.transcribe(audio)
+
+    assert result == "hello"
+    assert captured["files"]["language"] == (None, "ko")
+
+
 def test_channels_login_uses_discovered_plugin_class(monkeypatch):
+    from typer.testing import CliRunner
+
     from nanobot.cli.commands import app
     from nanobot.config.schema import Config
-    from typer.testing import CliRunner
 
     runner = CliRunner()
     seen: dict[str, object] = {}
@@ -329,9 +402,10 @@ def test_channels_login_uses_discovered_plugin_class(monkeypatch):
 
 
 def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
+    from typer.testing import CliRunner
+
     from nanobot.cli.commands import app
     from nanobot.config.schema import Config
-    from typer.testing import CliRunner
 
     runner = CliRunner()
     seen: dict[str, object] = {}
@@ -358,9 +432,10 @@ def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
 
 
 def test_channels_status_sets_custom_config_path(monkeypatch, tmp_path):
+    from typer.testing import CliRunner
+
     from nanobot.cli.commands import app
     from nanobot.config.schema import Config
-    from typer.testing import CliRunner
 
     runner = CliRunner()
     seen: dict[str, object] = {}