From c20ecc52d7a1a46bccf3c08c71f88d522e625d77 Mon Sep 17 00:00:00 2001 From: NanoBot Date: Wed, 3 Jun 2026 16:21:35 +0800 Subject: [PATCH] feat(transcription): add Xiaomi MiMo ASR provider (mimo-v2.5-asr) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Xiaomi MiMo ASR as a third transcription backend alongside Groq and OpenAI Whisper. Xiaomi ASR uses the /v1/chat/completions endpoint with base64-encoded audio input, rather than the standard Whisper multipart upload format. Co-Authored-By:连 --- docs/configuration.md | 6 +- nanobot/audio/transcription.py | 18 +++- nanobot/config/schema.py | 2 +- nanobot/providers/transcription.py | 123 +++++++++++++++++++++++- nanobot/webui/settings_api.py | 2 +- tests/providers/test_transcription.py | 132 ++++++++++++++++++++++++++ tests/webui/test_settings_api.py | 43 +++++++++ 7 files changed, 315 insertions(+), 11 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 06c83353b..1ae86d5fc 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -119,7 +119,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent ## Providers > [!TIP] -> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` for OpenAI Whisper or `"openrouter"` for OpenRouter speech-to-text models. API keys still live in the matching `providers.` config. +> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` for OpenAI Whisper, `"openrouter"` for OpenRouter speech-to-text models, or `"xiaomi_mimo"` for Xiaomi MiMo ASR. API keys still live in the matching `providers.` config. > - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link) > - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config. > - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`. @@ -1122,8 +1122,8 @@ Configure transcription under the top-level `transcription` section: | Setting | Default | Description | |---------|---------|-------------| | `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. | -| `provider` | `"groq"` | Transcription backend: `"groq"`, `"openai"`, or `"openrouter"`. | -| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq, `whisper-1` for OpenAI, and `openai/whisper-1` for OpenRouter. OpenRouter accepts only speech-to-text models on its transcription endpoint, such as `nvidia/parakeet-tdt-0.6b-v3`, `openai/whisper-1`, or `openai/gpt-4o-transcribe`; chat LLMs are rejected there. | +| `provider` | `"groq"` | Transcription backend: `"groq"`, `"openai"`, `"openrouter"`, or `"xiaomi_mimo"`. | +| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq, `whisper-1` for OpenAI, `openai/whisper-1` for OpenRouter, and `mimo-v2.5-asr` for Xiaomi MiMo ASR. OpenRouter accepts only speech-to-text models on its transcription endpoint, such as `nvidia/parakeet-tdt-0.6b-v3`, `openai/whisper-1`, or `openai/gpt-4o-transcribe`; chat LLMs are rejected there. | | `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. | | `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration. | | `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size. | diff --git a/nanobot/audio/transcription.py b/nanobot/audio/transcription.py index cc7cf286d..7e97517fa 100644 --- a/nanobot/audio/transcription.py +++ b/nanobot/audio/transcription.py @@ -18,13 +18,18 @@ from loguru import logger from nanobot.config.paths import get_media_dir from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url -TranscriptionProviderName = Literal["groq", "openai", "openrouter"] +TranscriptionProviderName = Literal["groq", "openai", "openrouter", "xiaomi_mimo"] _DEFAULT_PROVIDER: TranscriptionProviderName = "groq" _DEFAULT_MODELS: dict[TranscriptionProviderName, str] = { "groq": "whisper-large-v3", "openai": "whisper-1", "openrouter": "openai/whisper-1", + "xiaomi_mimo": "mimo-v2.5-asr", +} +_PROVIDER_ALIASES: dict[str, TranscriptionProviderName] = { + "mimo": "xiaomi_mimo", + "xiaomi": "xiaomi_mimo", } _MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024 _AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({ @@ -69,6 +74,8 @@ class TranscriptionIngressError(Exception): def _as_provider(value: Any) -> TranscriptionProviderName | None: if isinstance(value, str): name = value.strip().lower() + if name in _PROVIDER_ALIASES: + return _PROVIDER_ALIASES[name] if name in _DEFAULT_MODELS: return name # type: ignore[return-value] return None @@ -181,6 +188,15 @@ async def transcribe_audio_file( language=config.language, model=config.model, ) + elif config.provider == "xiaomi_mimo": + from nanobot.providers.transcription import XiaomiMiMoTranscriptionProvider + + provider = XiaomiMiMoTranscriptionProvider( + api_key=config.api_key, + api_base=config.api_base or None, + language=config.language, + model=config.model, + ) else: from nanobot.providers.transcription import GroqTranscriptionProvider diff --git a/nanobot/config/schema.py b/nanobot/config/schema.py index ba72d3729..e597052d6 100644 --- a/nanobot/config/schema.py +++ b/nanobot/config/schema.py @@ -47,7 +47,7 @@ class TranscriptionConfig(Base): """Cross-channel audio transcription configuration.""" enabled: bool = True - provider: Literal["groq", "openai", "openrouter"] | None = None + provider: Literal["groq", "openai", "openrouter", "xiaomi_mimo"] | None = None model: str | None = None language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$") max_duration_sec: int = Field(default=120, ge=1, le=600) diff --git a/nanobot/providers/transcription.py b/nanobot/providers/transcription.py index 7d4a0c013..997228bd0 100644 --- a/nanobot/providers/transcription.py +++ b/nanobot/providers/transcription.py @@ -1,8 +1,9 @@ """Provider-specific voice transcription adapters. This module only knows how to call external transcription APIs such as Groq, -OpenAI Whisper, and OpenRouter. Product-level config fallback, WebUI upload -validation, and channel integration live in ``nanobot.audio.transcription``. +OpenAI Whisper, OpenRouter, and Xiaomi MiMo ASR. Product-level config fallback, +WebUI upload validation, and channel integration live in +``nanobot.audio.transcription``. """ import asyncio @@ -16,6 +17,7 @@ from typing import Any import httpx from loguru import logger +_CHAT_COMPLETIONS_PATH = "chat/completions" _TRANSCRIPTIONS_PATH = "audio/transcriptions" _AUDIO_MIME_OVERRIDES = { ".m4a": "audio/mp4", @@ -51,6 +53,16 @@ def _resolve_transcription_url(api_base: str | None, default_url: str) -> str: return f"{base}/{_TRANSCRIPTIONS_PATH}" +def _resolve_chat_completions_url(api_base: str | None, default_url: str) -> str: + """Resolve a chat-completions endpoint for ASR providers using chat payloads.""" + if not api_base: + return default_url + base = api_base.rstrip("/") + if base.endswith(_CHAT_COMPLETIONS_PATH): + return base + return f"{base}/{_CHAT_COMPLETIONS_PATH}" + + def _audio_mime_type(path: Path) -> str: return ( _AUDIO_MIME_OVERRIDES.get(path.suffix.lower()) @@ -116,7 +128,7 @@ async def _post_transcription_with_retry( files["language"] = (None, language) return {"url": url, "headers": headers, "files": files, "timeout": 60.0} - return await _post_with_retry(build_request, provider_label) + return await _post_with_retry(build_request, provider_label, _text_from_transcription_payload) async def _post_json_transcription_with_retry( @@ -151,12 +163,61 @@ async def _post_json_transcription_with_retry( body["language"] = language return {"url": url, "headers": headers, "json": body, "timeout": 60.0} - return await _post_with_retry(build_request, provider_label) + return await _post_with_retry(build_request, provider_label, _text_from_transcription_payload) + + +async def _post_xiaomi_mimo_asr_with_retry( + url: str, + *, + api_key: str | None, + path: Path, + model: str, + provider_label: str, + language: str | None = None, +) -> str: + """POST audio to Xiaomi MiMo ASR's chat-completions transcription API.""" + try: + data = path.read_bytes() + except OSError as e: + logger.exception("{} transcription error: cannot read audio file: {}", provider_label, e) + return "" + + body: dict[str, Any] = { + "model": model, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": { + "data": ( + f"data:{_audio_mime_type(path)};base64," + f"{base64.b64encode(data).decode('ascii')}" + ), + }, + } + ], + } + ], + } + if language: + body["asr_options"] = {"language": language} + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + + def build_request() -> dict[str, Any]: + return {"url": url, "headers": headers, "json": body, "timeout": 60.0} + + return await _post_with_retry(build_request, provider_label, _text_from_chat_payload) async def _post_with_retry( build_request: Callable[[], dict[str, Any]], provider_label: str, + extract_text: Callable[[dict[str, Any]], str], ) -> str: async with httpx.AsyncClient() as client: for attempt in range(_MAX_RETRIES + 1): @@ -227,10 +288,23 @@ async def _post_with_retry( type(payload).__name__, ) return "" - return payload.get("text", "") + return extract_text(payload) return "" +def _text_from_transcription_payload(payload: dict[str, Any]) -> str: + text = payload.get("text") + return text if isinstance(text, str) else "" + + +def _text_from_chat_payload(payload: dict[str, Any]) -> str: + try: + text = payload["choices"][0]["message"]["content"] + except (KeyError, IndexError, TypeError): + return "" + return text if isinstance(text, str) else "" + + class OpenAITranscriptionProvider: """Voice transcription provider using OpenAI's Whisper API.""" @@ -357,3 +431,42 @@ class OpenRouterTranscriptionProvider: provider_label="OpenRouter", language=self.language, ) + + +class XiaomiMiMoTranscriptionProvider: + """Voice transcription provider using Xiaomi MiMo ASR.""" + + def __init__( + self, + api_key: str | None = None, + api_base: str | None = None, + language: str | None = None, + model: str | None = None, + ): + self.api_key = api_key or os.environ.get("MIMO_API_KEY") + self.api_url = _resolve_chat_completions_url( + api_base or os.environ.get("MIMO_API_BASE"), + "https://api.xiaomimimo.com/v1/chat/completions", + ) + self.language = language or None + self.model = model or "mimo-v2.5-asr" + logger.debug("Xiaomi MiMo transcription endpoint: {}", self.api_url) + + async def transcribe(self, file_path: str | Path) -> str: + if not self.api_key: + logger.warning("Xiaomi MiMo API key not configured for transcription") + return "" + + path = Path(file_path) + if not path.exists(): + logger.error("Audio file not found: {}", file_path) + return "" + + return await _post_xiaomi_mimo_asr_with_retry( + self.api_url, + api_key=self.api_key, + path=path, + model=self.model, + provider_label="Xiaomi MiMo", + language=self.language, + ) diff --git a/nanobot/webui/settings_api.py b/nanobot/webui/settings_api.py index cc6d76f82..71c7e08bf 100644 --- a/nanobot/webui/settings_api.py +++ b/nanobot/webui/settings_api.py @@ -91,7 +91,7 @@ _IMAGE_GENERATION_ASPECT_RATIOS = { "2:3", "21:9", } -_TRANSCRIPTION_PROVIDERS = ("groq", "openai", "openrouter") +_TRANSCRIPTION_PROVIDERS = ("groq", "openai", "openrouter", "xiaomi_mimo") _CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144} _MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+") _ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}") diff --git a/tests/providers/test_transcription.py b/tests/providers/test_transcription.py index 3fa3714da..574d5a44b 100644 --- a/tests/providers/test_transcription.py +++ b/tests/providers/test_transcription.py @@ -19,7 +19,9 @@ from nanobot.providers.transcription import ( GroqTranscriptionProvider, OpenAITranscriptionProvider, OpenRouterTranscriptionProvider, + XiaomiMiMoTranscriptionProvider, _audio_format, + _resolve_chat_completions_url, _resolve_transcription_url, ) @@ -95,6 +97,37 @@ def test_resolver_supports_openrouter_transcription_provider() -> None: assert resolved.api_base == "https://openrouter.ai/api/v1" +def test_resolver_supports_xiaomi_mimo_transcription_provider() -> None: + config = Config() + config.transcription.provider = "xiaomi_mimo" + config.transcription.model = "mimo-v2.5-asr" + config.transcription.language = "zh" + config.providers.xiaomi_mimo.api_key = "mimo-test" + config.providers.xiaomi_mimo.api_base = "https://api.xiaomimimo.com/v1" + + resolved = resolve_transcription_config(config) + + assert resolved.provider == "xiaomi_mimo" + assert resolved.model == "mimo-v2.5-asr" + assert resolved.language == "zh" + assert resolved.api_key == "mimo-test" + assert resolved.api_base == "https://api.xiaomimimo.com/v1" + + +def test_resolver_accepts_legacy_xiaomi_transcription_alias() -> None: + config = Config() + config.channels.transcription_provider = "xiaomi" + config.channels.transcription_language = "zh" + config.providers.xiaomi_mimo.api_key = "mimo-test" + + resolved = resolve_transcription_config(config) + + assert resolved.provider == "xiaomi_mimo" + assert resolved.model == "mimo-v2.5-asr" + assert resolved.language == "zh" + assert resolved.api_key == "mimo-test" + + @pytest.mark.asyncio async def test_transcribe_audio_file_routes_openrouter_provider(audio_file: Path) -> None: captured: dict[str, object] = {} @@ -131,6 +164,42 @@ async def test_transcribe_audio_file_routes_openrouter_provider(audio_file: Path } +@pytest.mark.asyncio +async def test_transcribe_audio_file_routes_xiaomi_mimo_provider(audio_file: Path) -> None: + captured: dict[str, object] = {} + + class StubXiaomiMiMo: + def __init__(self, **kwargs): + captured.update(kwargs) + + async def transcribe(self, file_path: str | Path) -> str: + captured["file_path"] = Path(file_path) + return "mimo ok" + + config = EffectiveTranscriptionConfig( + enabled=True, + provider="xiaomi_mimo", + model="mimo-v2.5-asr", + language="zh", + api_key="mimo-test", + api_base="https://api.xiaomimimo.com/v1", + max_duration_sec=120, + max_upload_mb=25, + ) + + with patch("nanobot.providers.transcription.XiaomiMiMoTranscriptionProvider", StubXiaomiMiMo): + result = await transcribe_audio_file(audio_file, config) + + assert result == "mimo ok" + assert captured == { + "api_key": "mimo-test", + "api_base": "https://api.xiaomimimo.com/v1", + "language": "zh", + "model": "mimo-v2.5-asr", + "file_path": audio_file, + } + + def test_resolved_transcription_repr_hides_api_key() -> None: config = Config() config.providers.groq.api_key = "gsk-secret" @@ -496,6 +565,69 @@ async def test_openrouter_shares_retry_contract(audio_file: Path) -> None: assert post.await_count == 2 +def test_resolve_chat_completions_url_appends_path_to_base() -> None: + default = "https://api.xiaomimimo.com/v1/chat/completions" + assert _resolve_chat_completions_url(None, default) == default + assert ( + _resolve_chat_completions_url("https://api.xiaomimimo.com/v1", default) + == "https://api.xiaomimimo.com/v1/chat/completions" + ) + assert _resolve_chat_completions_url(default, "https://x/chat/completions") == default + + +def test_xiaomi_mimo_defaults_and_base_normalization() -> None: + provider = XiaomiMiMoTranscriptionProvider(api_key="k") + assert provider.api_url == "https://api.xiaomimimo.com/v1/chat/completions" + assert provider.model == "mimo-v2.5-asr" + + custom = XiaomiMiMoTranscriptionProvider( + api_key="k", + api_base="https://token-plan-sgp.xiaomimimo.com/v1", + model="custom-asr", + ) + assert custom.api_url == "https://token-plan-sgp.xiaomimimo.com/v1/chat/completions" + assert custom.model == "custom-asr" + + +@pytest.mark.asyncio +async def test_xiaomi_mimo_sends_chat_completion_audio_payload(audio_file: Path) -> None: + provider = XiaomiMiMoTranscriptionProvider(api_key="k", language="zh") + post = AsyncMock( + return_value=_response( + 200, + {"choices": [{"message": {"content": "你好"}}]}, + ) + ) + + with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()): + assert await provider.transcribe(audio_file) == "你好" + + call = post.await_args_list[0].kwargs + assert "files" not in call + body = call["json"] + assert body["model"] == "mimo-v2.5-asr" + assert body["asr_options"] == {"language": "zh"} + audio = body["messages"][0]["content"][0]["input_audio"]["data"] + assert audio.startswith("data:audio/ogg;base64,") + assert base64.b64decode(audio.split(",", 1)[1]) == audio_file.read_bytes() + + +@pytest.mark.asyncio +async def test_xiaomi_mimo_shares_retry_contract(audio_file: Path) -> None: + provider = XiaomiMiMoTranscriptionProvider(api_key="k") + post = AsyncMock( + side_effect=[ + _response(503), + _response(200, {"choices": [{"message": {"content": "ok"}}]}), + ] + ) + + with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()): + assert await provider.transcribe(audio_file) == "ok" + + assert post.await_count == 2 + + @pytest.mark.parametrize("status", [408, 429, 500, 502, 503, 504]) @pytest.mark.asyncio async def test_retries_on_every_advertised_transient_status( diff --git a/tests/webui/test_settings_api.py b/tests/webui/test_settings_api.py index 80fcf29b1..754a74449 100644 --- a/tests/webui/test_settings_api.py +++ b/tests/webui/test_settings_api.py @@ -282,6 +282,23 @@ def test_settings_payload_exposes_openrouter_transcription_provider( assert providers["openrouter"]["configured"] is True +def test_settings_payload_exposes_xiaomi_mimo_transcription_provider( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + config = Config() + config.providers.xiaomi_mimo.api_key = "mimo-test" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + + payload = settings_payload() + + providers = {provider["name"]: provider for provider in payload["transcription"]["providers"]} + assert providers["xiaomi_mimo"]["label"] == "Xiaomi MIMO" + assert providers["xiaomi_mimo"]["configured"] is True + + def test_update_transcription_settings_writes_top_level_only( tmp_path, monkeypatch: pytest.MonkeyPatch, @@ -342,6 +359,32 @@ def test_update_transcription_settings_accepts_openrouter( assert payload["transcription"]["provider_configured"] is True +def test_update_transcription_settings_accepts_xiaomi_mimo( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + config = Config() + config.providers.xiaomi_mimo.api_key = "mimo-test" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + + payload = update_transcription_settings( + { + "provider": ["xiaomi_mimo"], + "model": ["mimo-v2.5-asr"], + "language": ["zh"], + } + ) + + saved = load_config(config_path) + assert saved.transcription.provider == "xiaomi_mimo" + assert saved.transcription.model == "mimo-v2.5-asr" + assert saved.transcription.language == "zh" + assert payload["transcription"]["provider"] == "xiaomi_mimo" + assert payload["transcription"]["provider_configured"] is True + + def test_update_transcription_settings_validates_language( tmp_path, monkeypatch: pytest.MonkeyPatch,