mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-22 09:32:33 +00:00
fix: allow specifying transcription language
This commit is contained in:
parent
1835f94d8e
commit
bc3d734df5
@ -45,7 +45,7 @@ IMAP_PASSWORD=your-password-here
|
||||
## Providers
|
||||
|
||||
> [!TIP]
|
||||
> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead — the API key is picked from the matching provider config.
|
||||
> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead, and optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) for more accurate transcription. The API key is picked from the matching provider config.
|
||||
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
|
||||
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
|
||||
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
|
||||
@ -440,6 +440,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
|
||||
"sendToolHints": false,
|
||||
"sendMaxRetries": 3,
|
||||
"transcriptionProvider": "groq",
|
||||
"transcriptionLanguage": null,
|
||||
"telegram": { ... }
|
||||
}
|
||||
}
|
||||
@ -451,6 +452,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
|
||||
| `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) |
|
||||
| `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
|
||||
| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key is auto-resolved from the matching provider config. |
|
||||
| `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. |
|
||||
|
||||
### Retry Behavior
|
||||
|
||||
|
||||
@ -25,7 +25,7 @@ class BaseChannel(ABC):
|
||||
transcription_provider: str = "groq"
|
||||
transcription_api_key: str = ""
|
||||
transcription_api_base: str = ""
|
||||
transcription_language: str = ""
|
||||
transcription_language: str | None = None
|
||||
|
||||
def __init__(self, config: Any, bus: MessageBus):
|
||||
"""
|
||||
@ -49,6 +49,7 @@ class BaseChannel(ABC):
|
||||
provider = OpenAITranscriptionProvider(
|
||||
api_key=self.transcription_api_key,
|
||||
api_base=self.transcription_api_base or None,
|
||||
language=self.transcription_language or None,
|
||||
)
|
||||
else:
|
||||
from nanobot.providers.transcription import GroqTranscriptionProvider
|
||||
|
||||
@ -63,6 +63,7 @@ class ChannelManager:
|
||||
transcription_provider = self.config.channels.transcription_provider
|
||||
transcription_key = self._resolve_transcription_key(transcription_provider)
|
||||
transcription_base = self._resolve_transcription_base(transcription_provider)
|
||||
transcription_language = self.config.channels.transcription_language
|
||||
|
||||
for name, cls in discover_all().items():
|
||||
section = getattr(self.config.channels, name, None)
|
||||
@ -88,7 +89,7 @@ class ChannelManager:
|
||||
channel.transcription_provider = transcription_provider
|
||||
channel.transcription_api_key = transcription_key
|
||||
channel.transcription_api_base = transcription_base
|
||||
channel.transcription_language = getattr(self.config.channels, "transcription_language", "")
|
||||
channel.transcription_language = transcription_language
|
||||
self.channels[name] = channel
|
||||
logger.info("{} channel enabled", cls.display_name)
|
||||
except Exception as e:
|
||||
|
||||
@ -29,7 +29,7 @@ class ChannelsConfig(Base):
|
||||
send_tool_hints: bool = False # stream tool-call hints (e.g. read_file("…"))
|
||||
send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included)
|
||||
transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai"
|
||||
transcription_language: str = "" # Language code for Whisper STT (e.g. "en", "ru", "zh")
|
||||
transcription_language: str | None = None # Optional ISO-639-1 hint for audio transcription
|
||||
|
||||
|
||||
class DreamConfig(Base):
|
||||
|
||||
@ -10,13 +10,19 @@ from loguru import logger
|
||||
class OpenAITranscriptionProvider:
|
||||
"""Voice transcription provider using OpenAI's Whisper API."""
|
||||
|
||||
def __init__(self, api_key: str | None = None, api_base: str | None = None):
|
||||
def __init__(
|
||||
self,
|
||||
api_key: str | None = None,
|
||||
api_base: str | None = None,
|
||||
language: str | None = None,
|
||||
):
|
||||
self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
|
||||
self.api_url = (
|
||||
api_base
|
||||
or os.environ.get("OPENAI_TRANSCRIPTION_BASE_URL")
|
||||
or "https://api.openai.com/v1/audio/transcriptions"
|
||||
)
|
||||
self.language = language
|
||||
|
||||
async def transcribe(self, file_path: str | Path) -> str:
|
||||
if not self.api_key:
|
||||
@ -30,6 +36,8 @@ class OpenAITranscriptionProvider:
|
||||
async with httpx.AsyncClient() as client:
|
||||
with open(path, "rb") as f:
|
||||
files = {"file": (path.name, f), "model": (None, "whisper-1")}
|
||||
if self.language:
|
||||
files["language"] = (None, self.language)
|
||||
headers = {"Authorization": f"Bearer {self.api_key}"}
|
||||
response = await client.post(
|
||||
self.api_url, headers=headers, files=files, timeout=60.0,
|
||||
@ -48,7 +56,12 @@ class GroqTranscriptionProvider:
|
||||
Groq offers extremely fast transcription with a generous free tier.
|
||||
"""
|
||||
|
||||
def __init__(self, api_key: str | None = None, api_base: str | None = None, language: str | None = None):
|
||||
def __init__(
|
||||
self,
|
||||
api_key: str | None = None,
|
||||
api_base: str | None = None,
|
||||
language: str | None = None,
|
||||
):
|
||||
self.api_key = api_key or os.environ.get("GROQ_API_KEY")
|
||||
self.api_url = api_base or os.environ.get("GROQ_BASE_URL") or "https://api.groq.com/openai/v1/audio/transcriptions"
|
||||
self.language = language
|
||||
|
||||
@ -15,7 +15,6 @@ from nanobot.channels.manager import ChannelManager
|
||||
from nanobot.config.schema import ChannelsConfig
|
||||
from nanobot.utils.restart import RestartNotice
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -200,8 +199,8 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels():
|
||||
fake_config = SimpleNamespace(
|
||||
channels=ChannelsConfig.model_validate({
|
||||
"fakeplugin": {"enabled": True, "allowFrom": ["*"]},
|
||||
"transcriptionLanguage": "en",
|
||||
}),
|
||||
transcription_provider="groq",
|
||||
providers=SimpleNamespace(
|
||||
groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"),
|
||||
openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"),
|
||||
@ -223,6 +222,7 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels():
|
||||
assert channel.transcription_provider == "groq"
|
||||
assert channel.transcription_api_key == "groq-key"
|
||||
assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
|
||||
assert channel.transcription_language == "en"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -269,13 +269,15 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider():
|
||||
channel.transcription_provider = "openai"
|
||||
channel.transcription_api_key = "k"
|
||||
channel.transcription_api_base = "http://override/v1/audio/transcriptions"
|
||||
channel.transcription_language = "en"
|
||||
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
class _StubOpenAI:
|
||||
def __init__(self, api_key=None, api_base=None):
|
||||
def __init__(self, api_key=None, api_base=None, language=None):
|
||||
captured["api_key"] = api_key
|
||||
captured["api_base"] = api_base
|
||||
captured["language"] = language
|
||||
|
||||
async def transcribe(self, file_path):
|
||||
return "ok"
|
||||
@ -286,6 +288,7 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider():
|
||||
assert result == "ok"
|
||||
assert captured["api_key"] == "k"
|
||||
assert captured["api_base"] == "http://override/v1/audio/transcriptions"
|
||||
assert captured["language"] == "en"
|
||||
|
||||
|
||||
def test_openai_transcription_provider_honors_api_base_argument():
|
||||
@ -300,10 +303,80 @@ def test_openai_transcription_provider_honors_api_base_argument():
|
||||
assert custom.api_url == "http://override/v1/audio/transcriptions"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_base_channel_passes_language_to_groq_transcription_provider():
    """Check that transcribe_audio hands the channel's language to the Groq provider.

    The real GroqTranscriptionProvider is swapped for a stub that records its
    constructor arguments, so no network request is made.
    """
    from nanobot.providers import transcription as transcription_mod

    seen_kwargs: dict[str, object] = {}

    class _StubGroq:
        # Records what the channel passes to the provider constructor.
        def __init__(self, api_key=None, api_base=None, language=None):
            seen_kwargs.update(api_key=api_key, api_base=api_base, language=language)

        async def transcribe(self, file_path):
            return "ok"

    plugin = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
    plugin.transcription_provider = "groq"
    plugin.transcription_api_key = "k"
    plugin.transcription_api_base = "http://override/v1/audio/transcriptions"
    plugin.transcription_language = "ko"

    with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq):
        transcript = await plugin.transcribe_audio("/tmp/does-not-matter.wav")

    assert transcript == "ok"
    assert seen_kwargs == {
        "api_key": "k",
        "api_base": "http://override/v1/audio/transcriptions",
        "language": "ko",
    }
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_groq_transcription_provider_includes_language(tmp_path):
    """Check that GroqTranscriptionProvider sends the language as a multipart field.

    httpx.AsyncClient is replaced with a stub whose ``post`` records its
    arguments, so the request payload can be inspected without any network I/O.
    """
    from nanobot.providers.transcription import GroqTranscriptionProvider

    request: dict[str, object] = {}

    class _Response:
        # Minimal successful response exposing only what the test needs.
        def json(self):
            return {"text": "hello"}

        def raise_for_status(self):
            return None

    class _AsyncClient:
        # Async-context-manager stub standing in for httpx.AsyncClient.
        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            return False

        async def post(self, url, headers=None, files=None, timeout=None):
            request.update(url=url, headers=headers, files=files, timeout=timeout)
            return _Response()

    sample = tmp_path / "sample.wav"
    sample.write_bytes(b"audio")

    groq = GroqTranscriptionProvider(api_key="k", language="ko")

    with patch("nanobot.providers.transcription.httpx.AsyncClient", return_value=_AsyncClient()):
        text = await groq.transcribe(sample)

    assert text == "hello"
    assert request["files"]["language"] == (None, "ko")
|
||||
|
||||
|
||||
def test_channels_login_uses_discovered_plugin_class(monkeypatch):
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from nanobot.cli.commands import app
|
||||
from nanobot.config.schema import Config
|
||||
from typer.testing import CliRunner
|
||||
|
||||
runner = CliRunner()
|
||||
seen: dict[str, object] = {}
|
||||
@ -329,9 +402,10 @@ def test_channels_login_uses_discovered_plugin_class(monkeypatch):
|
||||
|
||||
|
||||
def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from nanobot.cli.commands import app
|
||||
from nanobot.config.schema import Config
|
||||
from typer.testing import CliRunner
|
||||
|
||||
runner = CliRunner()
|
||||
seen: dict[str, object] = {}
|
||||
@ -358,9 +432,10 @@ def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
|
||||
|
||||
|
||||
def test_channels_status_sets_custom_config_path(monkeypatch, tmp_path):
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from nanobot.cli.commands import app
|
||||
from nanobot.config.schema import Config
|
||||
from typer.testing import CliRunner
|
||||
|
||||
runner = CliRunner()
|
||||
seen: dict[str, object] = {}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user