mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-22 01:22:48 +00:00
fix: allow specifying transcription language
This commit is contained in:
parent
1835f94d8e
commit
bc3d734df5
@ -45,7 +45,7 @@ IMAP_PASSWORD=your-password-here
|
|||||||
## Providers
|
## Providers
|
||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead — the API key is picked from the matching provider config.
|
> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead, and optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) for more accurate transcription. The API key is picked from the matching provider config.
|
||||||
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
|
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
|
||||||
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
|
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
|
||||||
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
|
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
|
||||||
@ -440,6 +440,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
|
|||||||
"sendToolHints": false,
|
"sendToolHints": false,
|
||||||
"sendMaxRetries": 3,
|
"sendMaxRetries": 3,
|
||||||
"transcriptionProvider": "groq",
|
"transcriptionProvider": "groq",
|
||||||
|
"transcriptionLanguage": null,
|
||||||
"telegram": { ... }
|
"telegram": { ... }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -451,6 +452,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
|
|||||||
| `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) |
|
| `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) |
|
||||||
| `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
|
| `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
|
||||||
| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key is auto-resolved from the matching provider config. |
|
| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key is auto-resolved from the matching provider config. |
|
||||||
|
| `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. |
|
||||||
|
|
||||||
### Retry Behavior
|
### Retry Behavior
|
||||||
|
|
||||||
|
|||||||
@ -25,7 +25,7 @@ class BaseChannel(ABC):
|
|||||||
transcription_provider: str = "groq"
|
transcription_provider: str = "groq"
|
||||||
transcription_api_key: str = ""
|
transcription_api_key: str = ""
|
||||||
transcription_api_base: str = ""
|
transcription_api_base: str = ""
|
||||||
transcription_language: str = ""
|
transcription_language: str | None = None
|
||||||
|
|
||||||
def __init__(self, config: Any, bus: MessageBus):
|
def __init__(self, config: Any, bus: MessageBus):
|
||||||
"""
|
"""
|
||||||
@ -49,6 +49,7 @@ class BaseChannel(ABC):
|
|||||||
provider = OpenAITranscriptionProvider(
|
provider = OpenAITranscriptionProvider(
|
||||||
api_key=self.transcription_api_key,
|
api_key=self.transcription_api_key,
|
||||||
api_base=self.transcription_api_base or None,
|
api_base=self.transcription_api_base or None,
|
||||||
|
language=self.transcription_language or None,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
from nanobot.providers.transcription import GroqTranscriptionProvider
|
from nanobot.providers.transcription import GroqTranscriptionProvider
|
||||||
|
|||||||
@ -63,6 +63,7 @@ class ChannelManager:
|
|||||||
transcription_provider = self.config.channels.transcription_provider
|
transcription_provider = self.config.channels.transcription_provider
|
||||||
transcription_key = self._resolve_transcription_key(transcription_provider)
|
transcription_key = self._resolve_transcription_key(transcription_provider)
|
||||||
transcription_base = self._resolve_transcription_base(transcription_provider)
|
transcription_base = self._resolve_transcription_base(transcription_provider)
|
||||||
|
transcription_language = self.config.channels.transcription_language
|
||||||
|
|
||||||
for name, cls in discover_all().items():
|
for name, cls in discover_all().items():
|
||||||
section = getattr(self.config.channels, name, None)
|
section = getattr(self.config.channels, name, None)
|
||||||
@ -88,7 +89,7 @@ class ChannelManager:
|
|||||||
channel.transcription_provider = transcription_provider
|
channel.transcription_provider = transcription_provider
|
||||||
channel.transcription_api_key = transcription_key
|
channel.transcription_api_key = transcription_key
|
||||||
channel.transcription_api_base = transcription_base
|
channel.transcription_api_base = transcription_base
|
||||||
channel.transcription_language = getattr(self.config.channels, "transcription_language", "")
|
channel.transcription_language = transcription_language
|
||||||
self.channels[name] = channel
|
self.channels[name] = channel
|
||||||
logger.info("{} channel enabled", cls.display_name)
|
logger.info("{} channel enabled", cls.display_name)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@ -29,7 +29,7 @@ class ChannelsConfig(Base):
|
|||||||
send_tool_hints: bool = False # stream tool-call hints (e.g. read_file("…"))
|
send_tool_hints: bool = False # stream tool-call hints (e.g. read_file("…"))
|
||||||
send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included)
|
send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included)
|
||||||
transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai"
|
transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai"
|
||||||
transcription_language: str = "" # Language code for Whisper STT (e.g. "en", "ru", "zh")
|
transcription_language: str | None = None # Optional ISO-639-1 hint for audio transcription
|
||||||
|
|
||||||
|
|
||||||
class DreamConfig(Base):
|
class DreamConfig(Base):
|
||||||
|
|||||||
@ -10,13 +10,19 @@ from loguru import logger
|
|||||||
class OpenAITranscriptionProvider:
|
class OpenAITranscriptionProvider:
|
||||||
"""Voice transcription provider using OpenAI's Whisper API."""
|
"""Voice transcription provider using OpenAI's Whisper API."""
|
||||||
|
|
||||||
def __init__(self, api_key: str | None = None, api_base: str | None = None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
api_key: str | None = None,
|
||||||
|
api_base: str | None = None,
|
||||||
|
language: str | None = None,
|
||||||
|
):
|
||||||
self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
|
self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
|
||||||
self.api_url = (
|
self.api_url = (
|
||||||
api_base
|
api_base
|
||||||
or os.environ.get("OPENAI_TRANSCRIPTION_BASE_URL")
|
or os.environ.get("OPENAI_TRANSCRIPTION_BASE_URL")
|
||||||
or "https://api.openai.com/v1/audio/transcriptions"
|
or "https://api.openai.com/v1/audio/transcriptions"
|
||||||
)
|
)
|
||||||
|
self.language = language
|
||||||
|
|
||||||
async def transcribe(self, file_path: str | Path) -> str:
|
async def transcribe(self, file_path: str | Path) -> str:
|
||||||
if not self.api_key:
|
if not self.api_key:
|
||||||
@ -30,6 +36,8 @@ class OpenAITranscriptionProvider:
|
|||||||
async with httpx.AsyncClient() as client:
|
async with httpx.AsyncClient() as client:
|
||||||
with open(path, "rb") as f:
|
with open(path, "rb") as f:
|
||||||
files = {"file": (path.name, f), "model": (None, "whisper-1")}
|
files = {"file": (path.name, f), "model": (None, "whisper-1")}
|
||||||
|
if self.language:
|
||||||
|
files["language"] = (None, self.language)
|
||||||
headers = {"Authorization": f"Bearer {self.api_key}"}
|
headers = {"Authorization": f"Bearer {self.api_key}"}
|
||||||
response = await client.post(
|
response = await client.post(
|
||||||
self.api_url, headers=headers, files=files, timeout=60.0,
|
self.api_url, headers=headers, files=files, timeout=60.0,
|
||||||
@ -48,7 +56,12 @@ class GroqTranscriptionProvider:
|
|||||||
Groq offers extremely fast transcription with a generous free tier.
|
Groq offers extremely fast transcription with a generous free tier.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, api_key: str | None = None, api_base: str | None = None, language: str | None = None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
api_key: str | None = None,
|
||||||
|
api_base: str | None = None,
|
||||||
|
language: str | None = None,
|
||||||
|
):
|
||||||
self.api_key = api_key or os.environ.get("GROQ_API_KEY")
|
self.api_key = api_key or os.environ.get("GROQ_API_KEY")
|
||||||
self.api_url = api_base or os.environ.get("GROQ_BASE_URL") or "https://api.groq.com/openai/v1/audio/transcriptions"
|
self.api_url = api_base or os.environ.get("GROQ_BASE_URL") or "https://api.groq.com/openai/v1/audio/transcriptions"
|
||||||
self.language = language
|
self.language = language
|
||||||
|
|||||||
@ -15,7 +15,6 @@ from nanobot.channels.manager import ChannelManager
|
|||||||
from nanobot.config.schema import ChannelsConfig
|
from nanobot.config.schema import ChannelsConfig
|
||||||
from nanobot.utils.restart import RestartNotice
|
from nanobot.utils.restart import RestartNotice
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Helpers
|
# Helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@ -200,8 +199,8 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels():
|
|||||||
fake_config = SimpleNamespace(
|
fake_config = SimpleNamespace(
|
||||||
channels=ChannelsConfig.model_validate({
|
channels=ChannelsConfig.model_validate({
|
||||||
"fakeplugin": {"enabled": True, "allowFrom": ["*"]},
|
"fakeplugin": {"enabled": True, "allowFrom": ["*"]},
|
||||||
|
"transcriptionLanguage": "en",
|
||||||
}),
|
}),
|
||||||
transcription_provider="groq",
|
|
||||||
providers=SimpleNamespace(
|
providers=SimpleNamespace(
|
||||||
groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"),
|
groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"),
|
||||||
openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"),
|
openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"),
|
||||||
@ -223,6 +222,7 @@ async def test_manager_propagates_groq_transcription_api_base_to_channels():
|
|||||||
assert channel.transcription_provider == "groq"
|
assert channel.transcription_provider == "groq"
|
||||||
assert channel.transcription_api_key == "groq-key"
|
assert channel.transcription_api_key == "groq-key"
|
||||||
assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
|
assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
|
||||||
|
assert channel.transcription_language == "en"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@ -269,13 +269,15 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider():
|
|||||||
channel.transcription_provider = "openai"
|
channel.transcription_provider = "openai"
|
||||||
channel.transcription_api_key = "k"
|
channel.transcription_api_key = "k"
|
||||||
channel.transcription_api_base = "http://override/v1/audio/transcriptions"
|
channel.transcription_api_base = "http://override/v1/audio/transcriptions"
|
||||||
|
channel.transcription_language = "en"
|
||||||
|
|
||||||
captured: dict[str, object] = {}
|
captured: dict[str, object] = {}
|
||||||
|
|
||||||
class _StubOpenAI:
|
class _StubOpenAI:
|
||||||
def __init__(self, api_key=None, api_base=None):
|
def __init__(self, api_key=None, api_base=None, language=None):
|
||||||
captured["api_key"] = api_key
|
captured["api_key"] = api_key
|
||||||
captured["api_base"] = api_base
|
captured["api_base"] = api_base
|
||||||
|
captured["language"] = language
|
||||||
|
|
||||||
async def transcribe(self, file_path):
|
async def transcribe(self, file_path):
|
||||||
return "ok"
|
return "ok"
|
||||||
@ -286,6 +288,7 @@ async def test_base_channel_passes_api_base_to_openai_transcription_provider():
|
|||||||
assert result == "ok"
|
assert result == "ok"
|
||||||
assert captured["api_key"] == "k"
|
assert captured["api_key"] == "k"
|
||||||
assert captured["api_base"] == "http://override/v1/audio/transcriptions"
|
assert captured["api_base"] == "http://override/v1/audio/transcriptions"
|
||||||
|
assert captured["language"] == "en"
|
||||||
|
|
||||||
|
|
||||||
def test_openai_transcription_provider_honors_api_base_argument():
|
def test_openai_transcription_provider_honors_api_base_argument():
|
||||||
@ -300,10 +303,80 @@ def test_openai_transcription_provider_honors_api_base_argument():
|
|||||||
assert custom.api_url == "http://override/v1/audio/transcriptions"
|
assert custom.api_url == "http://override/v1/audio/transcriptions"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_base_channel_passes_language_to_groq_transcription_provider():
|
||||||
|
"""BaseChannel.transcribe_audio must forward transcription_language to Groq."""
|
||||||
|
from nanobot.providers import transcription as transcription_mod
|
||||||
|
|
||||||
|
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
|
||||||
|
channel.transcription_provider = "groq"
|
||||||
|
channel.transcription_api_key = "k"
|
||||||
|
channel.transcription_api_base = "http://override/v1/audio/transcriptions"
|
||||||
|
channel.transcription_language = "ko"
|
||||||
|
|
||||||
|
captured: dict[str, object] = {}
|
||||||
|
|
||||||
|
class _StubGroq:
|
||||||
|
def __init__(self, api_key=None, api_base=None, language=None):
|
||||||
|
captured["api_key"] = api_key
|
||||||
|
captured["api_base"] = api_base
|
||||||
|
captured["language"] = language
|
||||||
|
|
||||||
|
async def transcribe(self, file_path):
|
||||||
|
return "ok"
|
||||||
|
|
||||||
|
with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq):
|
||||||
|
result = await channel.transcribe_audio("/tmp/does-not-matter.wav")
|
||||||
|
|
||||||
|
assert result == "ok"
|
||||||
|
assert captured["api_key"] == "k"
|
||||||
|
assert captured["api_base"] == "http://override/v1/audio/transcriptions"
|
||||||
|
assert captured["language"] == "ko"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_groq_transcription_provider_includes_language(tmp_path):
|
||||||
|
from nanobot.providers.transcription import GroqTranscriptionProvider
|
||||||
|
|
||||||
|
audio = tmp_path / "sample.wav"
|
||||||
|
audio.write_bytes(b"audio")
|
||||||
|
captured: dict[str, object] = {}
|
||||||
|
|
||||||
|
class _Response:
|
||||||
|
def raise_for_status(self):
|
||||||
|
return None
|
||||||
|
|
||||||
|
def json(self):
|
||||||
|
return {"text": "hello"}
|
||||||
|
|
||||||
|
class _AsyncClient:
|
||||||
|
async def __aenter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __aexit__(self, exc_type, exc, tb):
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def post(self, url, headers=None, files=None, timeout=None):
|
||||||
|
captured["url"] = url
|
||||||
|
captured["headers"] = headers
|
||||||
|
captured["files"] = files
|
||||||
|
captured["timeout"] = timeout
|
||||||
|
return _Response()
|
||||||
|
|
||||||
|
provider = GroqTranscriptionProvider(api_key="k", language="ko")
|
||||||
|
|
||||||
|
with patch("nanobot.providers.transcription.httpx.AsyncClient", return_value=_AsyncClient()):
|
||||||
|
result = await provider.transcribe(audio)
|
||||||
|
|
||||||
|
assert result == "hello"
|
||||||
|
assert captured["files"]["language"] == (None, "ko")
|
||||||
|
|
||||||
|
|
||||||
def test_channels_login_uses_discovered_plugin_class(monkeypatch):
|
def test_channels_login_uses_discovered_plugin_class(monkeypatch):
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
from nanobot.cli.commands import app
|
from nanobot.cli.commands import app
|
||||||
from nanobot.config.schema import Config
|
from nanobot.config.schema import Config
|
||||||
from typer.testing import CliRunner
|
|
||||||
|
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
seen: dict[str, object] = {}
|
seen: dict[str, object] = {}
|
||||||
@ -329,9 +402,10 @@ def test_channels_login_uses_discovered_plugin_class(monkeypatch):
|
|||||||
|
|
||||||
|
|
||||||
def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
|
def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
from nanobot.cli.commands import app
|
from nanobot.cli.commands import app
|
||||||
from nanobot.config.schema import Config
|
from nanobot.config.schema import Config
|
||||||
from typer.testing import CliRunner
|
|
||||||
|
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
seen: dict[str, object] = {}
|
seen: dict[str, object] = {}
|
||||||
@ -358,9 +432,10 @@ def test_channels_login_sets_custom_config_path(monkeypatch, tmp_path):
|
|||||||
|
|
||||||
|
|
||||||
def test_channels_status_sets_custom_config_path(monkeypatch, tmp_path):
|
def test_channels_status_sets_custom_config_path(monkeypatch, tmp_path):
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
from nanobot.cli.commands import app
|
from nanobot.cli.commands import app
|
||||||
from nanobot.config.schema import Config
|
from nanobot.config.schema import Config
|
||||||
from typer.testing import CliRunner
|
|
||||||
|
|
||||||
runner = CliRunner()
|
runner = CliRunner()
|
||||||
seen: dict[str, object] = {}
|
seen: dict[str, object] = {}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user