From 0eb3010e40a02faad578893ef8d537e565c8f807 Mon Sep 17 00:00:00 2001 From: Ilia Breitburg Date: Sat, 30 May 2026 22:15:57 +0200 Subject: [PATCH] feat(transcription): configurable STT model + OpenRouter provider Add an OpenRouter transcription backend, selectable through the top-level `transcription.provider` setting, so voice messages can be transcribed through OpenRouter's speech-to-text endpoint (e.g. nvidia/parakeet-tdt-0.6b-v3, openai/whisper-1), alongside the existing Groq/OpenAI Whisper providers. The existing `transcription.model` setting selects the STT model (unset = provider default, which is openai/whisper-1 for OpenRouter). - schema: accept "openrouter" for transcription.provider - providers/transcription: extract a shared POST/retry skeleton; add a JSON+base64 OpenRouterTranscriptionProvider that takes the STT model as a constructor param, like the existing providers - audio/transcription: route provider="openrouter" to the new backend and thread the configured model through to it - webui/settings_api: list "openrouter" among the selectable transcription providers - docs + tests Only dedicated STT models work on OpenRouter's transcription endpoint; chat LLMs (e.g. google/gemini-3.5-flash) are rejected there. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/configuration.md | 8 +- nanobot/audio/transcription.py | 12 +- nanobot/config/schema.py | 2 +- nanobot/providers/transcription.py | 121 +++++++++++++++++++-- nanobot/webui/settings_api.py | 2 +- tests/providers/test_transcription.py | 151 +++++++++++++++++++++++++- tests/webui/test_settings_api.py | 41 +++++++ 7 files changed, 319 insertions(+), 18 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 3ed500394..06c83353b 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -119,7 +119,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent ## Providers > [!TIP] -> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` to use OpenAI Whisper. API keys still live in the matching `providers.` config. 
+> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` for OpenAI Whisper or `"openrouter"` for OpenRouter speech-to-text models. API keys still live in the matching `providers.` config. > - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link) > - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config. > - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`. 
@@ -134,7 +134,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent | Provider | Purpose | Get API Key | |----------|---------|-------------| | `custom` | Any OpenAI-compatible endpoint | — | -| `openrouter` | LLM (recommended, access to all models) | [openrouter.ai](https://openrouter.ai) | +| `openrouter` | LLM (recommended, access to all models) + Voice transcription (STT models) | [openrouter.ai](https://openrouter.ai) | | `huggingface` | LLM (Hugging Face Inference Providers) | [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) | | `skywork` | LLM (Skywork / APIFree API gateway) | [apifree.ai](https://www.apifree.ai) | | `volcengine` | LLM (VolcEngine, pay-per-use) | [Coding Plan](https://www.volcengine.com/activity/codingplan?utm_campaign=nanobot&utm_content=nanobot&utm_medium=devrel&utm_source=OWO&utm_term=nanobot) · [volcengine.com](https://www.volcengine.com) | @@ -1122,8 +1122,8 @@ Configure transcription under the top-level `transcription` section: | Setting | Default | Description | |---------|---------|-------------| | `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. | -| `provider` | `"groq"` | Transcription backend: `"groq"` or `"openai"`. | -| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq and `whisper-1` for OpenAI. | +| `provider` | `"groq"` | Transcription backend: `"groq"`, `"openai"`, or `"openrouter"`. | +| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq, `whisper-1` for OpenAI, and `openai/whisper-1` for OpenRouter. OpenRouter accepts only speech-to-text models on its transcription endpoint, such as `nvidia/parakeet-tdt-0.6b-v3`, `openai/whisper-1`, or `openai/gpt-4o-transcribe`; chat LLMs are rejected there. | | `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. 
| | `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration. | | `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size. | diff --git a/nanobot/audio/transcription.py b/nanobot/audio/transcription.py index d27094f3c..cc7cf286d 100644 --- a/nanobot/audio/transcription.py +++ b/nanobot/audio/transcription.py @@ -18,12 +18,13 @@ from loguru import logger from nanobot.config.paths import get_media_dir from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url -TranscriptionProviderName = Literal["groq", "openai"] +TranscriptionProviderName = Literal["groq", "openai", "openrouter"] _DEFAULT_PROVIDER: TranscriptionProviderName = "groq" _DEFAULT_MODELS: dict[TranscriptionProviderName, str] = { "groq": "whisper-large-v3", "openai": "whisper-1", + "openrouter": "openai/whisper-1", } _MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024 _AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({ @@ -171,6 +172,15 @@ async def transcribe_audio_file( language=config.language, model=config.model, ) + elif config.provider == "openrouter": + from nanobot.providers.transcription import OpenRouterTranscriptionProvider + + provider = OpenRouterTranscriptionProvider( + api_key=config.api_key, + api_base=config.api_base or None, + language=config.language, + model=config.model, + ) else: from nanobot.providers.transcription import GroqTranscriptionProvider diff --git a/nanobot/config/schema.py b/nanobot/config/schema.py index 0a19fbfd4..ba72d3729 100644 --- a/nanobot/config/schema.py +++ b/nanobot/config/schema.py @@ -47,7 +47,7 @@ class TranscriptionConfig(Base): """Cross-channel audio transcription configuration.""" enabled: bool = True - provider: Literal["groq", "openai"] | None = None + provider: Literal["groq", "openai", "openrouter"] | None = None model: str | None = None language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$") max_duration_sec: int = Field(default=120, ge=1, le=600) diff --git a/nanobot/providers/transcription.py 
b/nanobot/providers/transcription.py index 4af95c4a7..7d4a0c013 100644 --- a/nanobot/providers/transcription.py +++ b/nanobot/providers/transcription.py @@ -1,14 +1,17 @@ """Provider-specific voice transcription adapters. -This module only knows how to call external transcription APIs such as Groq -and OpenAI Whisper. Product-level config fallback, WebUI upload validation, -and channel integration live in ``nanobot.audio.transcription``. +This module only knows how to call external transcription APIs such as Groq, +OpenAI Whisper, and OpenRouter. Product-level config fallback, WebUI upload +validation, and channel integration live in ``nanobot.audio.transcription``. """ import asyncio +import base64 import mimetypes import os +from collections.abc import Callable from pathlib import Path +from typing import Any import httpx from loguru import logger @@ -23,6 +26,13 @@ _AUDIO_MIME_OVERRIDES = { ".weba": "audio/webm", ".webm": "audio/webm", } +_FORMAT_ALIASES = { + "oga": "ogg", + "opus": "ogg", + "mpga": "mp3", + "mpeg": "mp3", + "mp4": "m4a", +} def _resolve_transcription_url(api_base: str | None, default_url: str) -> str: @@ -49,6 +59,12 @@ def _audio_mime_type(path: Path) -> str: ) +def _audio_format(path: Path) -> str: + """Map an audio file's extension to an OpenRouter ``format`` value.""" + ext = path.suffix.lstrip(".").lower() + return _FORMAT_ALIASES.get(ext, ext) + + # Up to 3 retries (4 attempts total) with exponential backoff on transient # failures. Whisper endpoints occasionally return 502/503 under load, and # mobile-network transcription callers hit sporadic connect/read errors. 
@@ -91,16 +107,61 @@ async def _post_transcription_with_retry( return "" headers = {"Authorization": f"Bearer {api_key}"} + def build_request() -> dict[str, Any]: + files = { + "file": (path.name, data, _audio_mime_type(path)), + "model": (None, model), + } + if language: + files["language"] = (None, language) + return {"url": url, "headers": headers, "files": files, "timeout": 60.0} + + return await _post_with_retry(build_request, provider_label) + + +async def _post_json_transcription_with_retry( + url: str, + *, + api_key: str | None, + path: Path, + model: str, + provider_label: str, + language: str | None = None, +) -> str: + """POST base64 JSON audio for providers that do not accept multipart uploads.""" + try: + data = path.read_bytes() + except OSError as e: + logger.exception("{} transcription error: cannot read audio file: {}", provider_label, e) + return "" + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + + def build_request() -> dict[str, Any]: + body: dict[str, object] = { + "model": model, + "input_audio": { + "data": base64.b64encode(data).decode(), + "format": _audio_format(path), + }, + } + if language: + body["language"] = language + return {"url": url, "headers": headers, "json": body, "timeout": 60.0} + + return await _post_with_retry(build_request, provider_label) + + +async def _post_with_retry( + build_request: Callable[[], dict[str, Any]], + provider_label: str, +) -> str: async with httpx.AsyncClient() as client: for attempt in range(_MAX_RETRIES + 1): - files = { - "file": (path.name, data, _audio_mime_type(path)), - "model": (None, model), - } - if language: - files["language"] = (None, language) try: - response = await client.post(url, headers=headers, files=files, timeout=60.0) + response = await client.post(**build_request()) except _RETRYABLE_EXCEPTIONS as e: if attempt < _MAX_RETRIES: logger.warning( @@ -167,6 +228,7 @@ async def _post_transcription_with_retry( ) return "" return 
payload.get("text", "") + return "" class OpenAITranscriptionProvider: @@ -256,3 +318,42 @@ class GroqTranscriptionProvider: provider_label="Groq", language=self.language, ) + + +class OpenRouterTranscriptionProvider: + """Voice transcription provider using OpenRouter's speech-to-text endpoint.""" + + def __init__( + self, + api_key: str | None = None, + api_base: str | None = None, + language: str | None = None, + model: str | None = None, + ): + self.api_key = api_key or os.environ.get("OPENROUTER_API_KEY") + self.api_url = _resolve_transcription_url( + api_base or os.environ.get("OPENROUTER_BASE_URL"), + "https://openrouter.ai/api/v1/audio/transcriptions", + ) + self.language = language or None + self.model = model or "openai/whisper-1" + logger.debug("OpenRouter transcription endpoint: {}", self.api_url) + + async def transcribe(self, file_path: str | Path) -> str: + if not self.api_key: + logger.warning("OpenRouter API key not configured for transcription") + return "" + + path = Path(file_path) + if not path.exists(): + logger.error("Audio file not found: {}", file_path) + return "" + + return await _post_json_transcription_with_retry( + self.api_url, + api_key=self.api_key, + path=path, + model=self.model, + provider_label="OpenRouter", + language=self.language, + ) diff --git a/nanobot/webui/settings_api.py b/nanobot/webui/settings_api.py index 3b90fe081..cc6d76f82 100644 --- a/nanobot/webui/settings_api.py +++ b/nanobot/webui/settings_api.py @@ -91,7 +91,7 @@ _IMAGE_GENERATION_ASPECT_RATIOS = { "2:3", "21:9", } -_TRANSCRIPTION_PROVIDERS = ("groq", "openai") +_TRANSCRIPTION_PROVIDERS = ("groq", "openai", "openrouter") _CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144} _MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+") _ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}") diff --git a/tests/providers/test_transcription.py b/tests/providers/test_transcription.py index c669a91d3..3fa3714da 100644 --- a/tests/providers/test_transcription.py +++ 
b/tests/providers/test_transcription.py @@ -2,17 +2,24 @@ from __future__ import annotations +import base64 from pathlib import Path from unittest.mock import AsyncMock, patch import httpx import pytest -from nanobot.audio.transcription import resolve_transcription_config +from nanobot.audio.transcription import ( + EffectiveTranscriptionConfig, + resolve_transcription_config, + transcribe_audio_file, +) from nanobot.config.schema import Config from nanobot.providers.transcription import ( GroqTranscriptionProvider, OpenAITranscriptionProvider, + OpenRouterTranscriptionProvider, + _audio_format, _resolve_transcription_url, ) @@ -71,6 +78,59 @@ def test_resolver_prefers_top_level_transcription_over_legacy_channels() -> None assert resolved.api_base == "https://groq.example/openai/v1" +def test_resolver_supports_openrouter_transcription_provider() -> None: + config = Config() + config.transcription.provider = "openrouter" + config.transcription.model = "nvidia/parakeet-tdt-0.6b-v3" + config.transcription.language = "en" + config.providers.openrouter.api_key = "sk-or-test" + config.providers.openrouter.api_base = "https://openrouter.ai/api/v1" + + resolved = resolve_transcription_config(config) + + assert resolved.provider == "openrouter" + assert resolved.model == "nvidia/parakeet-tdt-0.6b-v3" + assert resolved.language == "en" + assert resolved.api_key == "sk-or-test" + assert resolved.api_base == "https://openrouter.ai/api/v1" + + +@pytest.mark.asyncio +async def test_transcribe_audio_file_routes_openrouter_provider(audio_file: Path) -> None: + captured: dict[str, object] = {} + + class StubOpenRouter: + def __init__(self, **kwargs): + captured.update(kwargs) + + async def transcribe(self, file_path: str | Path) -> str: + captured["file_path"] = Path(file_path) + return "openrouter ok" + + config = EffectiveTranscriptionConfig( + enabled=True, + provider="openrouter", + model="nvidia/parakeet-tdt-0.6b-v3", + language="en", + api_key="sk-or-test", + 
api_base="https://openrouter.ai/api/v1", + max_duration_sec=120, + max_upload_mb=25, + ) + + with patch("nanobot.providers.transcription.OpenRouterTranscriptionProvider", StubOpenRouter): + result = await transcribe_audio_file(audio_file, config) + + assert result == "openrouter ok" + assert captured == { + "api_key": "sk-or-test", + "api_base": "https://openrouter.ai/api/v1", + "language": "en", + "model": "nvidia/parakeet-tdt-0.6b-v3", + "file_path": audio_file, + } + + def test_resolved_transcription_repr_hides_api_key() -> None: config = Config() config.providers.groq.api_key = "gsk-secret" @@ -347,6 +407,95 @@ async def test_returns_empty_on_non_dict_json_body(audio_file: Path) -> None: # --------------------------------------------------------------------------- +# --------------------------------------------------------------------------- +# Configurable model: forwarded to the multipart "model" field on all providers +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "provider_cls,default_model", + [(OpenAITranscriptionProvider, "whisper-1"), (GroqTranscriptionProvider, "whisper-large-v3")], + ids=["openai", "groq"], +) +def test_multipart_provider_model_defaults_and_override(provider_cls, default_model): + assert provider_cls(api_key="k").model == default_model + assert provider_cls(api_key="k", model="custom-stt").model == "custom-stt" + + +@pytest.mark.parametrize( + "provider_cls", + [OpenAITranscriptionProvider, GroqTranscriptionProvider], + ids=["openai", "groq"], +) +@pytest.mark.asyncio +async def test_multipart_provider_sends_configured_model(audio_file: Path, provider_cls) -> None: + provider = provider_cls(api_key="k", model="my-stt-model") + post = AsyncMock(return_value=_response(200, {"text": "ok"})) + with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()): + assert await provider.transcribe(audio_file) == "ok" + assert 
post.await_args_list[0].kwargs["files"]["model"] == (None, "my-stt-model") + + +# --------------------------------------------------------------------------- +# OpenRouter provider — JSON body with base64 audio + configurable STT model +# --------------------------------------------------------------------------- + + +def test_audio_format_maps_known_extensions() -> None: + assert _audio_format(Path("v.oga")) == "ogg" # Telegram voice notes + assert _audio_format(Path("v.opus")) == "ogg" + assert _audio_format(Path("v.mp4")) == "m4a" + assert _audio_format(Path("v.mp3")) == "mp3" + assert _audio_format(Path("v.wav")) == "wav" # passthrough for unknown + + +def test_openrouter_defaults_and_chat_base_normalization() -> None: + default = OpenRouterTranscriptionProvider(api_key="k") + assert default.api_url == "https://openrouter.ai/api/v1/audio/transcriptions" + assert default.model == "openai/whisper-1" + + # A chat-style base (what users copy from provider config) gets the path appended. 
+ chat_base = OpenRouterTranscriptionProvider(api_key="k", api_base="https://openrouter.ai/api/v1") + assert chat_base.api_url == "https://openrouter.ai/api/v1/audio/transcriptions" + + +@pytest.mark.asyncio +async def test_openrouter_sends_json_base64_body(audio_file: Path) -> None: + """OpenRouter gets a JSON body with base64 audio + format — never multipart.""" + provider = OpenRouterTranscriptionProvider( + api_key="k", model="nvidia/parakeet-tdt-0.6b-v3", language="en" + ) + post = AsyncMock(return_value=_response(200, {"text": "hi"})) + with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()): + assert await provider.transcribe(audio_file) == "hi" + call = post.await_args_list[0].kwargs + assert "files" not in call # not multipart + body = call["json"] + assert body["model"] == "nvidia/parakeet-tdt-0.6b-v3" + assert body["language"] == "en" + assert body["input_audio"]["format"] == "ogg" # .ogg fixture + assert base64.b64decode(body["input_audio"]["data"]) == audio_file.read_bytes() + + +@pytest.mark.asyncio +async def test_openrouter_omits_language_when_unset(audio_file: Path) -> None: + provider = OpenRouterTranscriptionProvider(api_key="k", model="openai/whisper-1") + post = AsyncMock(return_value=_response(200, {"text": "ok"})) + with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()): + assert await provider.transcribe(audio_file) == "ok" + assert "language" not in post.await_args_list[0].kwargs["json"] + + +@pytest.mark.asyncio +async def test_openrouter_shares_retry_contract(audio_file: Path) -> None: + """OpenRouter goes through the same retry helper: 503 retried, then 200.""" + provider = OpenRouterTranscriptionProvider(api_key="k", model="openai/whisper-1") + post = AsyncMock(side_effect=[_response(503), _response(200, {"text": "recovered"})]) + with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()): + assert await provider.transcribe(audio_file) == "recovered" + assert 
post.await_count == 2 + + @pytest.mark.parametrize("status", [408, 429, 500, 502, 503, 504]) @pytest.mark.asyncio async def test_retries_on_every_advertised_transient_status( diff --git a/tests/webui/test_settings_api.py b/tests/webui/test_settings_api.py index b9043816c..80fcf29b1 100644 --- a/tests/webui/test_settings_api.py +++ b/tests/webui/test_settings_api.py @@ -265,6 +265,23 @@ def test_settings_payload_includes_effective_transcription_config( assert payload["transcription"]["language"] == "en" +def test_settings_payload_exposes_openrouter_transcription_provider( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + config = Config() + config.providers.openrouter.api_key = "sk-or-test" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + + payload = settings_payload() + + providers = {provider["name"]: provider for provider in payload["transcription"]["providers"]} + assert providers["openrouter"]["label"] == "OpenRouter" + assert providers["openrouter"]["configured"] is True + + def test_update_transcription_settings_writes_top_level_only( tmp_path, monkeypatch: pytest.MonkeyPatch, @@ -301,6 +318,30 @@ def test_update_transcription_settings_writes_top_level_only( assert payload["transcription"]["provider_configured"] is True +def test_update_transcription_settings_accepts_openrouter( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + config = Config() + config.providers.openrouter.api_key = "sk-or-test" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + + payload = update_transcription_settings( + { + "provider": ["openrouter"], + "model": ["nvidia/parakeet-tdt-0.6b-v3"], + } + ) + + saved = load_config(config_path) + assert saved.transcription.provider == "openrouter" + assert saved.transcription.model == 
"nvidia/parakeet-tdt-0.6b-v3" + assert payload["transcription"]["provider"] == "openrouter" + assert payload["transcription"]["provider_configured"] is True + + def test_update_transcription_settings_validates_language( tmp_path, monkeypatch: pytest.MonkeyPatch,