fix: allow specifying transcription language

This commit is contained in:
k 2026-04-22 07:56:35 +09:00 committed by chengyongru
parent 1835f94d8e
commit bc3d734df5
6 changed files with 104 additions and 12 deletions

View File

@ -45,7 +45,7 @@ IMAP_PASSWORD=your-password-here
## Providers ## Providers
> [!TIP] > [!TIP]
> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead — the API key is picked from the matching provider config. > - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead, and optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) for more accurate transcription. The API key is picked from the matching provider config.
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link) > - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config. > - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`. > - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
@ -440,6 +440,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
"sendToolHints": false, "sendToolHints": false,
"sendMaxRetries": 3, "sendMaxRetries": 3,
"transcriptionProvider": "groq", "transcriptionProvider": "groq",
"transcriptionLanguage": null,
"telegram": { ... } "telegram": { ... }
} }
} }
@ -451,6 +452,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
| `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) | | `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) |
| `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) | | `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key is auto-resolved from the matching provider config. | | `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key is auto-resolved from the matching provider config. |
| `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. |
### Retry Behavior ### Retry Behavior

View File

@ -25,7 +25,7 @@ class BaseChannel(ABC):
transcription_provider: str = "groq" transcription_provider: str = "groq"
transcription_api_key: str = "" transcription_api_key: str = ""
transcription_api_base: str = "" transcription_api_base: str = ""
transcription_language: str = "" transcription_language: str | None = None
def __init__(self, config: Any, bus: MessageBus): def __init__(self, config: Any, bus: MessageBus):
""" """
@ -49,6 +49,7 @@ class BaseChannel(ABC):
provider = OpenAITranscriptionProvider( provider = OpenAITranscriptionProvider(
api_key=self.transcription_api_key, api_key=self.transcription_api_key,
api_base=self.transcription_api_base or None, api_base=self.transcription_api_base or None,
language=self.transcription_language or None,
) )
else: else:
from nanobot.providers.transcription import GroqTranscriptionProvider from nanobot.providers.transcription import GroqTranscriptionProvider

View File

@ -63,6 +63,7 @@ class ChannelManager:
transcription_provider = self.config.channels.transcription_provider transcription_provider = self.config.channels.transcription_provider
transcription_key = self._resolve_transcription_key(transcription_provider) transcription_key = self._resolve_transcription_key(transcription_provider)
transcription_base = self._resolve_transcription_base(transcription_provider) transcription_base = self._resolve_transcription_base(transcription_provider)
transcription_language = self.config.channels.transcription_language
for name, cls in discover_all().items(): for name, cls in discover_all().items():
section = getattr(self.config.channels, name, None) section = getattr(self.config.channels, name, None)
@ -88,7 +89,7 @@ class ChannelManager:
channel.transcription_provider = transcription_provider channel.transcription_provider = transcription_provider
channel.transcription_api_key = transcription_key channel.transcription_api_key = transcription_key
channel.transcription_api_base = transcription_base channel.transcription_api_base = transcription_base
channel.transcription_language = getattr(self.config.channels, "transcription_language", "") channel.transcription_language = transcription_language
self.channels[name] = channel self.channels[name] = channel
logger.info("{} channel enabled", cls.display_name) logger.info("{} channel enabled", cls.display_name)
except Exception as e: except Exception as e:

View File

@ -29,7 +29,7 @@ class ChannelsConfig(Base):
send_tool_hints: bool = False # stream tool-call hints (e.g. read_file("…")) send_tool_hints: bool = False # stream tool-call hints (e.g. read_file("…"))
send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included) send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included)
transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai" transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai"
transcription_language: str = "" # Language code for Whisper STT (e.g. "en", "ru", "zh") transcription_language: str | None = None # Optional ISO-639-1 hint for audio transcription
class DreamConfig(Base): class DreamConfig(Base):

View File

@ -10,13 +10,19 @@ from loguru import logger
class OpenAITranscriptionProvider: class OpenAITranscriptionProvider:
"""Voice transcription provider using OpenAI's Whisper API.""" """Voice transcription provider using OpenAI's Whisper API."""
def __init__(self, api_key: str | None = None, api_base: str | None = None): def __init__(
self,
api_key: str | None = None,
api_base: str | None = None,
language: str | None = None,
):
self.api_key = api_key or os.environ.get("OPENAI_API_KEY") self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
self.api_url = ( self.api_url = (
api_base api_base
or os.environ.get("OPENAI_TRANSCRIPTION_BASE_URL") or os.environ.get("OPENAI_TRANSCRIPTION_BASE_URL")
or "https://api.openai.com/v1/audio/transcriptions" or "https://api.openai.com/v1/audio/transcriptions"
) )
self.language = language
async def transcribe(self, file_path: str | Path) -> str: async def transcribe(self, file_path: str | Path) -> str:
if not self.api_key: if not self.api_key:
@ -30,6 +36,8 @@ class OpenAITranscriptionProvider:
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
with open(path, "rb") as f: with open(path, "rb") as f:
files = {"file": (path.name, f), "model": (None, "whisper-1")} files = {"file": (path.name, f), "model": (None, "whisper-1")}
if self.language:
files["language"] = (None, self.language)
headers = {"Authorization": f"Bearer {self.api_key}"} headers = {"Authorization": f"Bearer {self.api_key}"}
response = await client.post( response = await client.post(
self.api_url, headers=headers, files=files, timeout=60.0, self.api_url, headers=headers, files=files, timeout=60.0,
@ -48,7 +56,12 @@ class GroqTranscriptionProvider:
Groq offers extremely fast transcription with a generous free tier. Groq offers extremely fast transcription with a generous free tier.
""" """
def __init__(self, api_key: str | None = None, api_base: str | None = None, language: str | None = None): def __init__(
self,
api_key: str | None = None,
api_base: str | None = None,
language: str | None = None,
):
self.api_key = api_key or os.environ.get("GROQ_API_KEY") self.api_key = api_key or os.environ.get("GROQ_API_KEY")
self.api_url = api_base or os.environ.get("GROQ_BASE_URL") or "https://api.groq.com/openai/v1/audio/transcriptions" self.api_url = api_base or os.environ.get("GROQ_BASE_URL") or "https://api.groq.com/openai/v1/audio/transcriptions"
self.language = language self.language = language

View File

@ -15,7 +15,6 @@ from nanobot.channels.manager import ChannelManager
from nanobot.config.schema import ChannelsConfig from nanobot.config.schema import ChannelsConfig
from nanobot.utils.restart import RestartNotice from nanobot.utils.restart import RestartNotice
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Helpers # Helpers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -200,8 +199,8 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels():
fake_config = SimpleNamespace( fake_config = SimpleNamespace(
channels=ChannelsConfig.model_validate({ channels=ChannelsConfig.model_validate({
"fakeplugin": {"enabled": True, "allowFrom": ["*"]}, "fakeplugin": {"enabled": True, "allowFrom": ["*"]},
"transcriptionLanguage": "en",
}), }),
transcription_provider="groq",
providers=SimpleNamespace( providers=SimpleNamespace(
groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"), groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"),
openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"), openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"),
@ -223,6 +222,7 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels():
assert channel.transcription_provider == "groq" assert channel.transcription_provider == "groq"
assert channel.transcription_api_key == "groq-key" assert channel.transcription_api_key == "groq-key"
assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions" assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
assert channel.transcription_language == "en"
@pytest.mark.asyncio @pytest.mark.asyncio
@ -269,13 +269,15 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider():
channel.transcription_provider = "openai" channel.transcription_provider = "openai"
channel.transcription_api_key = "k" channel.transcription_api_key = "k"
channel.transcription_api_base = "http://override/v1/audio/transcriptions" channel.transcription_api_base = "http://override/v1/audio/transcriptions"
channel.transcription_language = "en"
captured: dict[str, object] = {} captured: dict[str, object] = {}
class _StubOpenAI: class _StubOpenAI:
def __init__(self, api_key=None, api_base=None): def __init__(self, api_key=None, api_base=None, language=None):
captured["api_key"] = api_key captured["api_key"] = api_key
captured["api_base"] = api_base captured["api_base"] = api_base
captured["language"] = language
async def transcribe(self, file_path): async def transcribe(self, file_path):
return "ok" return "ok"
@ -286,6 +288,7 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider():
assert result == "ok" assert result == "ok"
assert captured["api_key"] == "k" assert captured["api_key"] == "k"
assert captured["api_base"] == "http://override/v1/audio/transcriptions" assert captured["api_base"] == "http://override/v1/audio/transcriptions"
assert captured["language"] == "en"
def test_openai_transcription_provider_honors_api_base_argument(): def test_openai_transcription_provider_honors_api_base_argument():
@ -300,10 +303,80 @@ def test_openai_transcription_provider_honors_api_base_argument():
assert custom.api_url == "http://override/v1/audio/transcriptions" assert custom.api_url == "http://override/v1/audio/transcriptions"
@pytest.mark.asyncio
async def test_base_channel_passes_language_to_groq_transcription_provider():
    """The Groq branch of transcribe_audio must hand over the configured language.

    GroqTranscriptionProvider is swapped for a stub that records its
    constructor keyword arguments, so the test checks exactly what the
    channel passes when it builds the provider.
    """
    from nanobot.providers import transcription as transcription_mod

    seen_kwargs: dict[str, object] = {}

    class _StubGroq:
        # Same keyword signature as the constructor call under test.
        def __init__(self, api_key=None, api_base=None, language=None):
            seen_kwargs.update(
                api_key=api_key,
                api_base=api_base,
                language=language,
            )

        async def transcribe(self, file_path):
            return "ok"

    plugin = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
    plugin.transcription_provider = "groq"
    plugin.transcription_api_key = "k"
    plugin.transcription_api_base = "http://override/v1/audio/transcriptions"
    plugin.transcription_language = "ko"

    with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq):
        transcript = await plugin.transcribe_audio("/tmp/does-not-matter.wav")

    assert transcript == "ok"
    assert seen_kwargs == {
        "api_key": "k",
        "api_base": "http://override/v1/audio/transcriptions",
        "language": "ko",
    }
@pytest.mark.asyncio
async def test_groq_transcription_provider_includes_language(tmp_path):
    """GroqTranscriptionProvider.transcribe must send the language as a form field.

    httpx.AsyncClient (as referenced from nanobot.providers.transcription) is
    replaced with a stub whose ``post`` records its arguments and returns a
    canned JSON response.
    """
    from nanobot.providers.transcription import GroqTranscriptionProvider

    request: dict[str, object] = {}

    class _Response:
        def raise_for_status(self):
            return None

        def json(self):
            return {"text": "hello"}

    class _AsyncClient:
        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            # Falsy return: exceptions raised inside the block propagate.
            return False

        async def post(self, url, headers=None, files=None, timeout=None):
            request.update(url=url, headers=headers, files=files, timeout=timeout)
            return _Response()

    sample = tmp_path / "sample.wav"
    sample.write_bytes(b"audio")

    groq = GroqTranscriptionProvider(api_key="k", language="ko")
    with patch(
        "nanobot.providers.transcription.httpx.AsyncClient",
        return_value=_AsyncClient(),
    ):
        text = await groq.transcribe(sample)

    assert text == "hello"
    assert request["files"]["language"] == (None, "ko")
def test_channels_login_uses_discovered_plugin_class(monkeypatch): def test_channels_login_uses_discovered_plugin_class(monkeypatch):
from typer.testing import CliRunner
from nanobot.cli.commands import app from nanobot.cli.commands import app
from nanobot.config.schema import Config from nanobot.config.schema import Config
from typer.testing import CliRunner
runner = CliRunner() runner = CliRunner()
seen: dict[str, object] = {} seen: dict[str, object] = {}
@ -329,9 +402,10 @@ def test_channels_login_uses_discovered_plugin_class(monkeypatch):
def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path): def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
from typer.testing import CliRunner
from nanobot.cli.commands import app from nanobot.cli.commands import app
from nanobot.config.schema import Config from nanobot.config.schema import Config
from typer.testing import CliRunner
runner = CliRunner() runner = CliRunner()
seen: dict[str, object] = {} seen: dict[str, object] = {}
@ -358,9 +432,10 @@ def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
def test_channels_status_sets_custom_config_path(monkeypatch, tmp_path): def test_channels_status_sets_custom_config_path(monkeypatch, tmp_path):
from typer.testing import CliRunner
from nanobot.cli.commands import app from nanobot.cli.commands import app
from nanobot.config.schema import Config from nanobot.config.schema import Config
from typer.testing import CliRunner
runner = CliRunner() runner = CliRunner()
seen: dict[str, object] = {} seen: dict[str, object] = {}