mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-06-15 15:24:06 +00:00
feat(transcription): add Xiaomi MiMo ASR provider (mimo-v2.5-asr)
Add support for Xiaomi MiMo ASR as a fourth transcription backend alongside Groq, OpenAI Whisper, and OpenRouter. Xiaomi ASR uses the /v1/chat/completions endpoint with base64-encoded audio input, rather than the standard Whisper multipart upload format. Co-Authored-By: 连 <lian@tangping.homes>
This commit is contained in:
parent
552ec18a3c
commit
c20ecc52d7
@ -119,7 +119,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent
|
|||||||
## Providers
|
## Providers
|
||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` for OpenAI Whisper or `"openrouter"` for OpenRouter speech-to-text models. API keys still live in the matching `providers.<provider>` config.
|
> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` for OpenAI Whisper, `"openrouter"` for OpenRouter speech-to-text models, or `"xiaomi_mimo"` for Xiaomi MiMo ASR. API keys still live in the matching `providers.<provider>` config.
|
||||||
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
|
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
|
||||||
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
|
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
|
||||||
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
|
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
|
||||||
@ -1122,8 +1122,8 @@ Configure transcription under the top-level `transcription` section:
|
|||||||
| Setting | Default | Description |
|
| Setting | Default | Description |
|
||||||
|---------|---------|-------------|
|
|---------|---------|-------------|
|
||||||
| `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. |
|
| `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. |
|
||||||
| `provider` | `"groq"` | Transcription backend: `"groq"`, `"openai"`, or `"openrouter"`. |
|
| `provider` | `"groq"` | Transcription backend: `"groq"`, `"openai"`, `"openrouter"`, or `"xiaomi_mimo"`. |
|
||||||
| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq, `whisper-1` for OpenAI, and `openai/whisper-1` for OpenRouter. OpenRouter accepts only speech-to-text models on its transcription endpoint, such as `nvidia/parakeet-tdt-0.6b-v3`, `openai/whisper-1`, or `openai/gpt-4o-transcribe`; chat LLMs are rejected there. |
|
| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq, `whisper-1` for OpenAI, `openai/whisper-1` for OpenRouter, and `mimo-v2.5-asr` for Xiaomi MiMo ASR. OpenRouter accepts only speech-to-text models on its transcription endpoint, such as `nvidia/parakeet-tdt-0.6b-v3`, `openai/whisper-1`, or `openai/gpt-4o-transcribe`; chat LLMs are rejected there. |
|
||||||
| `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. |
|
| `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. |
|
||||||
| `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration. |
|
| `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration. |
|
||||||
| `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size. |
|
| `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size. |
|
||||||
|
|||||||
@ -18,13 +18,18 @@ from loguru import logger
|
|||||||
from nanobot.config.paths import get_media_dir
|
from nanobot.config.paths import get_media_dir
|
||||||
from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url
|
from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url
|
||||||
|
|
||||||
TranscriptionProviderName = Literal["groq", "openai", "openrouter"]
|
TranscriptionProviderName = Literal["groq", "openai", "openrouter", "xiaomi_mimo"]
|
||||||
|
|
||||||
_DEFAULT_PROVIDER: TranscriptionProviderName = "groq"
|
_DEFAULT_PROVIDER: TranscriptionProviderName = "groq"
|
||||||
_DEFAULT_MODELS: dict[TranscriptionProviderName, str] = {
|
_DEFAULT_MODELS: dict[TranscriptionProviderName, str] = {
|
||||||
"groq": "whisper-large-v3",
|
"groq": "whisper-large-v3",
|
||||||
"openai": "whisper-1",
|
"openai": "whisper-1",
|
||||||
"openrouter": "openai/whisper-1",
|
"openrouter": "openai/whisper-1",
|
||||||
|
"xiaomi_mimo": "mimo-v2.5-asr",
|
||||||
|
}
|
||||||
|
_PROVIDER_ALIASES: dict[str, TranscriptionProviderName] = {
|
||||||
|
"mimo": "xiaomi_mimo",
|
||||||
|
"xiaomi": "xiaomi_mimo",
|
||||||
}
|
}
|
||||||
_MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024
|
_MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024
|
||||||
_AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({
|
_AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({
|
||||||
@ -69,6 +74,8 @@ class TranscriptionIngressError(Exception):
|
|||||||
def _as_provider(value: Any) -> TranscriptionProviderName | None:
    """Normalize a raw config value into a canonical transcription provider name.

    Non-string values yield ``None``. Strings are stripped and lower-cased,
    then matched first against ``_PROVIDER_ALIASES`` (returning the alias
    target) and then against the keys of ``_DEFAULT_MODELS``. Anything else
    yields ``None``.
    """
    if not isinstance(value, str):
        return None

    key = value.strip().lower()

    # Aliases take precedence over canonical names.
    alias_target = _PROVIDER_ALIASES.get(key)
    if alias_target is not None:
        return alias_target

    return key if key in _DEFAULT_MODELS else None  # type: ignore[return-value]
|
||||||
@ -181,6 +188,15 @@ async def transcribe_audio_file(
|
|||||||
language=config.language,
|
language=config.language,
|
||||||
model=config.model,
|
model=config.model,
|
||||||
)
|
)
|
||||||
|
elif config.provider == "xiaomi_mimo":
|
||||||
|
from nanobot.providers.transcription import XiaomiMiMoTranscriptionProvider
|
||||||
|
|
||||||
|
provider = XiaomiMiMoTranscriptionProvider(
|
||||||
|
api_key=config.api_key,
|
||||||
|
api_base=config.api_base or None,
|
||||||
|
language=config.language,
|
||||||
|
model=config.model,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
from nanobot.providers.transcription import GroqTranscriptionProvider
|
from nanobot.providers.transcription import GroqTranscriptionProvider
|
||||||
|
|
||||||
|
|||||||
@ -47,7 +47,7 @@ class TranscriptionConfig(Base):
|
|||||||
"""Cross-channel audio transcription configuration."""
|
"""Cross-channel audio transcription configuration."""
|
||||||
|
|
||||||
enabled: bool = True
|
enabled: bool = True
|
||||||
provider: Literal["groq", "openai", "openrouter"] | None = None
|
provider: Literal["groq", "openai", "openrouter", "xiaomi_mimo"] | None = None
|
||||||
model: str | None = None
|
model: str | None = None
|
||||||
language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")
|
language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")
|
||||||
max_duration_sec: int = Field(default=120, ge=1, le=600)
|
max_duration_sec: int = Field(default=120, ge=1, le=600)
|
||||||
|
|||||||
@ -1,8 +1,9 @@
|
|||||||
"""Provider-specific voice transcription adapters.
|
"""Provider-specific voice transcription adapters.
|
||||||
|
|
||||||
This module only knows how to call external transcription APIs such as Groq,
|
This module only knows how to call external transcription APIs such as Groq,
|
||||||
OpenAI Whisper, and OpenRouter. Product-level config fallback, WebUI upload
|
OpenAI Whisper, OpenRouter, and Xiaomi MiMo ASR. Product-level config fallback,
|
||||||
validation, and channel integration live in ``nanobot.audio.transcription``.
|
WebUI upload validation, and channel integration live in
|
||||||
|
``nanobot.audio.transcription``.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
@ -16,6 +17,7 @@ from typing import Any
|
|||||||
import httpx
|
import httpx
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
|
_CHAT_COMPLETIONS_PATH = "chat/completions"
|
||||||
_TRANSCRIPTIONS_PATH = "audio/transcriptions"
|
_TRANSCRIPTIONS_PATH = "audio/transcriptions"
|
||||||
_AUDIO_MIME_OVERRIDES = {
|
_AUDIO_MIME_OVERRIDES = {
|
||||||
".m4a": "audio/mp4",
|
".m4a": "audio/mp4",
|
||||||
@ -51,6 +53,16 @@ def _resolve_transcription_url(api_base: str | None, default_url: str) -> str:
|
|||||||
return f"{base}/{_TRANSCRIPTIONS_PATH}"
|
return f"{base}/{_TRANSCRIPTIONS_PATH}"
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_chat_completions_url(api_base: str | None, default_url: str) -> str:
|
||||||
|
"""Resolve a chat-completions endpoint for ASR providers using chat payloads."""
|
||||||
|
if not api_base:
|
||||||
|
return default_url
|
||||||
|
base = api_base.rstrip("/")
|
||||||
|
if base.endswith(_CHAT_COMPLETIONS_PATH):
|
||||||
|
return base
|
||||||
|
return f"{base}/{_CHAT_COMPLETIONS_PATH}"
|
||||||
|
|
||||||
|
|
||||||
def _audio_mime_type(path: Path) -> str:
|
def _audio_mime_type(path: Path) -> str:
|
||||||
return (
|
return (
|
||||||
_AUDIO_MIME_OVERRIDES.get(path.suffix.lower())
|
_AUDIO_MIME_OVERRIDES.get(path.suffix.lower())
|
||||||
@ -116,7 +128,7 @@ async def _post_transcription_with_retry(
|
|||||||
files["language"] = (None, language)
|
files["language"] = (None, language)
|
||||||
return {"url": url, "headers": headers, "files": files, "timeout": 60.0}
|
return {"url": url, "headers": headers, "files": files, "timeout": 60.0}
|
||||||
|
|
||||||
return await _post_with_retry(build_request, provider_label)
|
return await _post_with_retry(build_request, provider_label, _text_from_transcription_payload)
|
||||||
|
|
||||||
|
|
||||||
async def _post_json_transcription_with_retry(
|
async def _post_json_transcription_with_retry(
|
||||||
@ -151,12 +163,61 @@ async def _post_json_transcription_with_retry(
|
|||||||
body["language"] = language
|
body["language"] = language
|
||||||
return {"url": url, "headers": headers, "json": body, "timeout": 60.0}
|
return {"url": url, "headers": headers, "json": body, "timeout": 60.0}
|
||||||
|
|
||||||
return await _post_with_retry(build_request, provider_label)
|
return await _post_with_retry(build_request, provider_label, _text_from_transcription_payload)
|
||||||
|
|
||||||
|
|
||||||
|
def _read_base64_audio(path: Path) -> str:
    """Read ``path`` and return its bytes as an ASCII base64 string."""
    return base64.b64encode(path.read_bytes()).decode("ascii")


async def _post_xiaomi_mimo_asr_with_retry(
    url: str,
    *,
    api_key: str | None,
    path: Path,
    model: str,
    provider_label: str,
    language: str | None = None,
) -> str:
    """POST audio to Xiaomi MiMo ASR's chat-completions transcription API.

    The audio file is embedded in a single user message as an ``input_audio``
    content part carrying a base64 ``data:`` URL, and the request is handed to
    ``_post_with_retry`` with ``_text_from_chat_payload`` as the extractor.

    Args:
        url: Chat-completions endpoint to POST to.
        api_key: Bearer token placed in the ``Authorization`` header.
        path: Audio file to transcribe.
        model: Model identifier sent in the request body.
        provider_label: Human-readable provider name used in log messages.
        language: Optional language hint, sent as ``asr_options.language``.

    Returns:
        The text returned by ``_post_with_retry``, or ``""`` when the audio
        file cannot be read.
    """
    try:
        # Reading and base64-encoding a multi-megabyte recording is blocking
        # work; run it in a worker thread so the event loop stays responsive.
        encoded_audio = await asyncio.to_thread(_read_base64_audio, path)
    except OSError as e:
        logger.exception("{} transcription error: cannot read audio file: {}", provider_label, e)
        return ""

    body: dict[str, Any] = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": f"data:{_audio_mime_type(path)};base64,{encoded_audio}",
                        },
                    }
                ],
            }
        ],
    }
    if language:
        body["asr_options"] = {"language": language}

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    def build_request() -> dict[str, Any]:
        # The payload is built once above and reused for every attempt.
        return {"url": url, "headers": headers, "json": body, "timeout": 60.0}

    return await _post_with_retry(build_request, provider_label, _text_from_chat_payload)
|
||||||
|
|
||||||
|
|
||||||
async def _post_with_retry(
|
async def _post_with_retry(
|
||||||
build_request: Callable[[], dict[str, Any]],
|
build_request: Callable[[], dict[str, Any]],
|
||||||
provider_label: str,
|
provider_label: str,
|
||||||
|
extract_text: Callable[[dict[str, Any]], str],
|
||||||
) -> str:
|
) -> str:
|
||||||
async with httpx.AsyncClient() as client:
|
async with httpx.AsyncClient() as client:
|
||||||
for attempt in range(_MAX_RETRIES + 1):
|
for attempt in range(_MAX_RETRIES + 1):
|
||||||
@ -227,10 +288,23 @@ async def _post_with_retry(
|
|||||||
type(payload).__name__,
|
type(payload).__name__,
|
||||||
)
|
)
|
||||||
return ""
|
return ""
|
||||||
return payload.get("text", "")
|
return extract_text(payload)
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _text_from_transcription_payload(payload: dict[str, Any]) -> str:
|
||||||
|
text = payload.get("text")
|
||||||
|
return text if isinstance(text, str) else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _text_from_chat_payload(payload: dict[str, Any]) -> str:
|
||||||
|
try:
|
||||||
|
text = payload["choices"][0]["message"]["content"]
|
||||||
|
except (KeyError, IndexError, TypeError):
|
||||||
|
return ""
|
||||||
|
return text if isinstance(text, str) else ""
|
||||||
|
|
||||||
|
|
||||||
class OpenAITranscriptionProvider:
|
class OpenAITranscriptionProvider:
|
||||||
"""Voice transcription provider using OpenAI's Whisper API."""
|
"""Voice transcription provider using OpenAI's Whisper API."""
|
||||||
|
|
||||||
@ -357,3 +431,42 @@ class OpenRouterTranscriptionProvider:
|
|||||||
provider_label="OpenRouter",
|
provider_label="OpenRouter",
|
||||||
language=self.language,
|
language=self.language,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class XiaomiMiMoTranscriptionProvider:
    """Transcribe voice recordings through Xiaomi MiMo ASR.

    Explicit constructor arguments win over the ``MIMO_API_KEY`` and
    ``MIMO_API_BASE`` environment variables. The endpoint is resolved once
    at construction time via ``_resolve_chat_completions_url``.
    """

    def __init__(
        self,
        api_key: str | None = None,
        api_base: str | None = None,
        language: str | None = None,
        model: str | None = None,
    ):
        self.api_key = api_key or os.environ.get("MIMO_API_KEY")

        configured_base = api_base or os.environ.get("MIMO_API_BASE")
        self.api_url = _resolve_chat_completions_url(
            configured_base,
            "https://api.xiaomimimo.com/v1/chat/completions",
        )

        # Empty strings are treated the same as "not provided".
        self.language = language if language else None
        self.model = model if model else "mimo-v2.5-asr"

        logger.debug("Xiaomi MiMo transcription endpoint: {}", self.api_url)

    async def transcribe(self, file_path: str | Path) -> str:
        """Return the transcript for ``file_path``.

        Returns an empty string when no API key is configured or the file
        does not exist; otherwise returns whatever
        ``_post_xiaomi_mimo_asr_with_retry`` produces.
        """
        if not self.api_key:
            logger.warning("Xiaomi MiMo API key not configured for transcription")
            return ""

        audio_path = Path(file_path)
        if not audio_path.exists():
            logger.error("Audio file not found: {}", file_path)
            return ""

        transcript = await _post_xiaomi_mimo_asr_with_retry(
            self.api_url,
            api_key=self.api_key,
            path=audio_path,
            model=self.model,
            provider_label="Xiaomi MiMo",
            language=self.language,
        )
        return transcript
|
||||||
|
|||||||
@ -91,7 +91,7 @@ _IMAGE_GENERATION_ASPECT_RATIOS = {
|
|||||||
"2:3",
|
"2:3",
|
||||||
"21:9",
|
"21:9",
|
||||||
}
|
}
|
||||||
_TRANSCRIPTION_PROVIDERS = ("groq", "openai", "openrouter")
|
_TRANSCRIPTION_PROVIDERS = ("groq", "openai", "openrouter", "xiaomi_mimo")
|
||||||
_CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144}
|
_CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144}
|
||||||
_MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+")
|
_MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+")
|
||||||
_ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
|
_ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
|
||||||
|
|||||||
@ -19,7 +19,9 @@ from nanobot.providers.transcription import (
|
|||||||
GroqTranscriptionProvider,
|
GroqTranscriptionProvider,
|
||||||
OpenAITranscriptionProvider,
|
OpenAITranscriptionProvider,
|
||||||
OpenRouterTranscriptionProvider,
|
OpenRouterTranscriptionProvider,
|
||||||
|
XiaomiMiMoTranscriptionProvider,
|
||||||
_audio_format,
|
_audio_format,
|
||||||
|
_resolve_chat_completions_url,
|
||||||
_resolve_transcription_url,
|
_resolve_transcription_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -95,6 +97,37 @@ def test_resolver_supports_openrouter_transcription_provider() -> None:
|
|||||||
assert resolved.api_base == "https://openrouter.ai/api/v1"
|
assert resolved.api_base == "https://openrouter.ai/api/v1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolver_supports_xiaomi_mimo_transcription_provider() -> None:
|
||||||
|
config = Config()
|
||||||
|
config.transcription.provider = "xiaomi_mimo"
|
||||||
|
config.transcription.model = "mimo-v2.5-asr"
|
||||||
|
config.transcription.language = "zh"
|
||||||
|
config.providers.xiaomi_mimo.api_key = "mimo-test"
|
||||||
|
config.providers.xiaomi_mimo.api_base = "https://api.xiaomimimo.com/v1"
|
||||||
|
|
||||||
|
resolved = resolve_transcription_config(config)
|
||||||
|
|
||||||
|
assert resolved.provider == "xiaomi_mimo"
|
||||||
|
assert resolved.model == "mimo-v2.5-asr"
|
||||||
|
assert resolved.language == "zh"
|
||||||
|
assert resolved.api_key == "mimo-test"
|
||||||
|
assert resolved.api_base == "https://api.xiaomimimo.com/v1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolver_accepts_legacy_xiaomi_transcription_alias() -> None:
|
||||||
|
config = Config()
|
||||||
|
config.channels.transcription_provider = "xiaomi"
|
||||||
|
config.channels.transcription_language = "zh"
|
||||||
|
config.providers.xiaomi_mimo.api_key = "mimo-test"
|
||||||
|
|
||||||
|
resolved = resolve_transcription_config(config)
|
||||||
|
|
||||||
|
assert resolved.provider == "xiaomi_mimo"
|
||||||
|
assert resolved.model == "mimo-v2.5-asr"
|
||||||
|
assert resolved.language == "zh"
|
||||||
|
assert resolved.api_key == "mimo-test"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_transcribe_audio_file_routes_openrouter_provider(audio_file: Path) -> None:
|
async def test_transcribe_audio_file_routes_openrouter_provider(audio_file: Path) -> None:
|
||||||
captured: dict[str, object] = {}
|
captured: dict[str, object] = {}
|
||||||
@ -131,6 +164,42 @@ async def test_transcribe_audio_file_routes_openrouter_provider(audio_file: Path
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_transcribe_audio_file_routes_xiaomi_mimo_provider(audio_file: Path) -> None:
|
||||||
|
captured: dict[str, object] = {}
|
||||||
|
|
||||||
|
class StubXiaomiMiMo:
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
captured.update(kwargs)
|
||||||
|
|
||||||
|
async def transcribe(self, file_path: str | Path) -> str:
|
||||||
|
captured["file_path"] = Path(file_path)
|
||||||
|
return "mimo ok"
|
||||||
|
|
||||||
|
config = EffectiveTranscriptionConfig(
|
||||||
|
enabled=True,
|
||||||
|
provider="xiaomi_mimo",
|
||||||
|
model="mimo-v2.5-asr",
|
||||||
|
language="zh",
|
||||||
|
api_key="mimo-test",
|
||||||
|
api_base="https://api.xiaomimimo.com/v1",
|
||||||
|
max_duration_sec=120,
|
||||||
|
max_upload_mb=25,
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch("nanobot.providers.transcription.XiaomiMiMoTranscriptionProvider", StubXiaomiMiMo):
|
||||||
|
result = await transcribe_audio_file(audio_file, config)
|
||||||
|
|
||||||
|
assert result == "mimo ok"
|
||||||
|
assert captured == {
|
||||||
|
"api_key": "mimo-test",
|
||||||
|
"api_base": "https://api.xiaomimimo.com/v1",
|
||||||
|
"language": "zh",
|
||||||
|
"model": "mimo-v2.5-asr",
|
||||||
|
"file_path": audio_file,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_resolved_transcription_repr_hides_api_key() -> None:
|
def test_resolved_transcription_repr_hides_api_key() -> None:
|
||||||
config = Config()
|
config = Config()
|
||||||
config.providers.groq.api_key = "gsk-secret"
|
config.providers.groq.api_key = "gsk-secret"
|
||||||
@ -496,6 +565,69 @@ async def test_openrouter_shares_retry_contract(audio_file: Path) -> None:
|
|||||||
assert post.await_count == 2
|
assert post.await_count == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_chat_completions_url_appends_path_to_base() -> None:
|
||||||
|
default = "https://api.xiaomimimo.com/v1/chat/completions"
|
||||||
|
assert _resolve_chat_completions_url(None, default) == default
|
||||||
|
assert (
|
||||||
|
_resolve_chat_completions_url("https://api.xiaomimimo.com/v1", default)
|
||||||
|
== "https://api.xiaomimimo.com/v1/chat/completions"
|
||||||
|
)
|
||||||
|
assert _resolve_chat_completions_url(default, "https://x/chat/completions") == default
|
||||||
|
|
||||||
|
|
||||||
|
def test_xiaomi_mimo_defaults_and_base_normalization() -> None:
|
||||||
|
provider = XiaomiMiMoTranscriptionProvider(api_key="k")
|
||||||
|
assert provider.api_url == "https://api.xiaomimimo.com/v1/chat/completions"
|
||||||
|
assert provider.model == "mimo-v2.5-asr"
|
||||||
|
|
||||||
|
custom = XiaomiMiMoTranscriptionProvider(
|
||||||
|
api_key="k",
|
||||||
|
api_base="https://token-plan-sgp.xiaomimimo.com/v1",
|
||||||
|
model="custom-asr",
|
||||||
|
)
|
||||||
|
assert custom.api_url == "https://token-plan-sgp.xiaomimimo.com/v1/chat/completions"
|
||||||
|
assert custom.model == "custom-asr"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_xiaomi_mimo_sends_chat_completion_audio_payload(audio_file: Path) -> None:
|
||||||
|
provider = XiaomiMiMoTranscriptionProvider(api_key="k", language="zh")
|
||||||
|
post = AsyncMock(
|
||||||
|
return_value=_response(
|
||||||
|
200,
|
||||||
|
{"choices": [{"message": {"content": "你好"}}]},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()):
|
||||||
|
assert await provider.transcribe(audio_file) == "你好"
|
||||||
|
|
||||||
|
call = post.await_args_list[0].kwargs
|
||||||
|
assert "files" not in call
|
||||||
|
body = call["json"]
|
||||||
|
assert body["model"] == "mimo-v2.5-asr"
|
||||||
|
assert body["asr_options"] == {"language": "zh"}
|
||||||
|
audio = body["messages"][0]["content"][0]["input_audio"]["data"]
|
||||||
|
assert audio.startswith("data:audio/ogg;base64,")
|
||||||
|
assert base64.b64decode(audio.split(",", 1)[1]) == audio_file.read_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_xiaomi_mimo_shares_retry_contract(audio_file: Path) -> None:
|
||||||
|
provider = XiaomiMiMoTranscriptionProvider(api_key="k")
|
||||||
|
post = AsyncMock(
|
||||||
|
side_effect=[
|
||||||
|
_response(503),
|
||||||
|
_response(200, {"choices": [{"message": {"content": "ok"}}]}),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()):
|
||||||
|
assert await provider.transcribe(audio_file) == "ok"
|
||||||
|
|
||||||
|
assert post.await_count == 2
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("status", [408, 429, 500, 502, 503, 504])
|
@pytest.mark.parametrize("status", [408, 429, 500, 502, 503, 504])
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_retries_on_every_advertised_transient_status(
|
async def test_retries_on_every_advertised_transient_status(
|
||||||
|
|||||||
@ -282,6 +282,23 @@ def test_settings_payload_exposes_openrouter_transcription_provider(
|
|||||||
assert providers["openrouter"]["configured"] is True
|
assert providers["openrouter"]["configured"] is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_settings_payload_exposes_xiaomi_mimo_transcription_provider(
|
||||||
|
tmp_path,
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
) -> None:
|
||||||
|
config_path = tmp_path / "config.json"
|
||||||
|
config = Config()
|
||||||
|
config.providers.xiaomi_mimo.api_key = "mimo-test"
|
||||||
|
save_config(config, config_path)
|
||||||
|
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||||
|
|
||||||
|
payload = settings_payload()
|
||||||
|
|
||||||
|
providers = {provider["name"]: provider for provider in payload["transcription"]["providers"]}
|
||||||
|
assert providers["xiaomi_mimo"]["label"] == "Xiaomi MIMO"
|
||||||
|
assert providers["xiaomi_mimo"]["configured"] is True
|
||||||
|
|
||||||
|
|
||||||
def test_update_transcription_settings_writes_top_level_only(
|
def test_update_transcription_settings_writes_top_level_only(
|
||||||
tmp_path,
|
tmp_path,
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
@ -342,6 +359,32 @@ def test_update_transcription_settings_accepts_openrouter(
|
|||||||
assert payload["transcription"]["provider_configured"] is True
|
assert payload["transcription"]["provider_configured"] is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_transcription_settings_accepts_xiaomi_mimo(
|
||||||
|
tmp_path,
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
) -> None:
|
||||||
|
config_path = tmp_path / "config.json"
|
||||||
|
config = Config()
|
||||||
|
config.providers.xiaomi_mimo.api_key = "mimo-test"
|
||||||
|
save_config(config, config_path)
|
||||||
|
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||||
|
|
||||||
|
payload = update_transcription_settings(
|
||||||
|
{
|
||||||
|
"provider": ["xiaomi_mimo"],
|
||||||
|
"model": ["mimo-v2.5-asr"],
|
||||||
|
"language": ["zh"],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
saved = load_config(config_path)
|
||||||
|
assert saved.transcription.provider == "xiaomi_mimo"
|
||||||
|
assert saved.transcription.model == "mimo-v2.5-asr"
|
||||||
|
assert saved.transcription.language == "zh"
|
||||||
|
assert payload["transcription"]["provider"] == "xiaomi_mimo"
|
||||||
|
assert payload["transcription"]["provider_configured"] is True
|
||||||
|
|
||||||
|
|
||||||
def test_update_transcription_settings_validates_language(
|
def test_update_transcription_settings_validates_language(
|
||||||
tmp_path,
|
tmp_path,
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user