mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-06-15 15:24:06 +00:00
feat(transcription): add AssemblyAI as transcription provider
Add AssemblyAI as a third transcription provider option alongside OpenAI and Groq. AssemblyAI offers better accuracy for certain audio types (distant voices, noisy environments) and serves as a reliable fallback when other providers struggle. Changes: - Add AssemblyAITranscriptionProvider class in providers/transcription.py - Add 'assemblyai' option in base channel's transcribe_audio() - Per-channel configuration via transcriptionProvider in config Usage: Set transcriptionProvider: 'assemblyai' and provide an AssemblyAI API key via transcriptionApiKey in the channel config.
This commit is contained in:
parent
f183b37542
commit
f3eb2aa08b
@ -28,9 +28,9 @@ Use these when you want deeper customization, integration, or extension details.
|
|||||||
|
|
||||||
| Topic | Repo docs | What it covers |
|
| Topic | Repo docs | What it covers |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
|
| Development | [`development.md`](./development.md) | Contributor notes for adding providers and transcription adapters |
|
||||||
| Memory | [`memory.md`](./memory.md) | How nanobot stores, consolidates, and restores memory |
|
| Memory | [`memory.md`](./memory.md) | How nanobot stores, consolidates, and restores memory |
|
||||||
| Python SDK | [`python-sdk.md`](./python-sdk.md) | Use nanobot programmatically from Python |
|
| Python SDK | [`python-sdk.md`](./python-sdk.md) | Use nanobot programmatically from Python |
|
||||||
| Channel plugin guide | [`channel-plugin-guide.md`](./channel-plugin-guide.md) | Build and test custom chat channel plugins |
|
| Channel plugin guide | [`channel-plugin-guide.md`](./channel-plugin-guide.md) | Build and test custom chat channel plugins |
|
||||||
| WebSocket channel | [`websocket.md`](./websocket.md) | Real-time WebSocket access and protocol details |
|
| WebSocket channel | [`websocket.md`](./websocket.md) | Real-time WebSocket access and protocol details |
|
||||||
| Custom tools | [`my-tool.md`](./my-tool.md) | Inspect and tune runtime state with the `my` tool |
|
| Custom tools | [`my-tool.md`](./my-tool.md) | Inspect and tune runtime state with the `my` tool |
|
||||||
|
|
||||||
|
|||||||
@ -119,7 +119,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent
|
|||||||
## Providers
|
## Providers
|
||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` for OpenAI Whisper, `"openrouter"` for OpenRouter speech-to-text models, or `"xiaomi_mimo"` for Xiaomi MiMo ASR. API keys still live in the matching `providers.<provider>` config.
|
> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` for OpenAI Whisper, `"openrouter"` for OpenRouter speech-to-text models, `"xiaomi_mimo"` for Xiaomi MiMo ASR, or `"assemblyai"` for AssemblyAI. API keys still live in the matching `providers.<provider>` config.
|
||||||
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
|
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
|
||||||
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
|
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
|
||||||
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
|
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
|
||||||
@ -143,6 +143,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent
|
|||||||
| `azure_openai` | LLM (Azure OpenAI) | [portal.azure.com](https://portal.azure.com) |
|
| `azure_openai` | LLM (Azure OpenAI) | [portal.azure.com](https://portal.azure.com) |
|
||||||
| `bedrock` | LLM (AWS Bedrock Converse, Claude/Nova/Llama/etc.) | [aws.amazon.com/bedrock](https://aws.amazon.com/bedrock/) |
|
| `bedrock` | LLM (AWS Bedrock Converse, Claude/Nova/Llama/etc.) | [aws.amazon.com/bedrock](https://aws.amazon.com/bedrock/) |
|
||||||
| `openai` | LLM + Voice transcription (Whisper) | [platform.openai.com](https://platform.openai.com) |
|
| `openai` | LLM + Voice transcription (Whisper) | [platform.openai.com](https://platform.openai.com) |
|
||||||
|
| `assemblyai` | Voice transcription only | [assemblyai.com](https://www.assemblyai.com/) |
|
||||||
| `deepseek` | LLM (DeepSeek direct) | [platform.deepseek.com](https://platform.deepseek.com) |
|
| `deepseek` | LLM (DeepSeek direct) | [platform.deepseek.com](https://platform.deepseek.com) |
|
||||||
| `groq` | LLM + Voice transcription (Whisper, default) | [console.groq.com](https://console.groq.com) |
|
| `groq` | LLM + Voice transcription (Whisper, default) | [console.groq.com](https://console.groq.com) |
|
||||||
| `minimax` | LLM (MiniMax direct) | [platform.minimaxi.com](https://platform.minimaxi.com) |
|
| `minimax` | LLM (MiniMax direct) | [platform.minimaxi.com](https://platform.minimaxi.com) |
|
||||||
@ -957,48 +958,8 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct --port 8000
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
<details>
|
Contributor notes for adding new providers live in
|
||||||
<summary><b>Adding a New Provider (Developer Guide)</b></summary>
|
[`development.md`](./development.md#adding-an-llm-provider).
|
||||||
|
|
||||||
nanobot uses a **Provider Registry** (`nanobot/providers/registry.py`) as the single source of truth.
|
|
||||||
Adding a new provider only takes **2 steps** — no if-elif chains to touch.
|
|
||||||
|
|
||||||
**Step 1.** Add a `ProviderSpec` entry to `PROVIDERS` in `nanobot/providers/registry.py`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
ProviderSpec(
|
|
||||||
name="myprovider", # config field name
|
|
||||||
keywords=("myprovider", "mymodel"), # model-name keywords for auto-matching
|
|
||||||
env_key="MYPROVIDER_API_KEY", # env var name
|
|
||||||
display_name="My Provider", # shown in `nanobot status`
|
|
||||||
default_api_base="https://api.myprovider.com/v1", # OpenAI-compatible endpoint
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 2.** Add a field to `ProvidersConfig` in `nanobot/config/schema.py`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
class ProvidersConfig(BaseModel):
|
|
||||||
...
|
|
||||||
myprovider: ProviderConfig = ProviderConfig()
|
|
||||||
```
|
|
||||||
|
|
||||||
That's it! Environment variables, model routing, config matching, and `nanobot status` display will all work automatically.
|
|
||||||
|
|
||||||
**Common `ProviderSpec` options:**
|
|
||||||
|
|
||||||
| Field | Description | Example |
|
|
||||||
|-------|-------------|---------|
|
|
||||||
| `default_api_base` | OpenAI-compatible base URL | `"https://api.deepseek.com"` |
|
|
||||||
| `env_extras` | Additional env vars to set | `(("ZHIPUAI_API_KEY", "{api_key}"),)` |
|
|
||||||
| `model_overrides` | Per-model parameter overrides | `(("kimi-k2.5", {"temperature": 1.0}), ("kimi-k2.6", {"temperature": 1.0}),)` |
|
|
||||||
| `is_gateway` | Can route any model (like OpenRouter) | `True` |
|
|
||||||
| `detect_by_key_prefix` | Detect gateway by API key prefix | `"sk-or-"` |
|
|
||||||
| `detect_by_base_keyword` | Detect gateway by API base URL | `"openrouter"` |
|
|
||||||
| `strip_model_prefix` | Strip provider prefix before sending to gateway | `True` (for AiHubMix) |
|
|
||||||
| `supports_max_completion_tokens` | Use `max_completion_tokens` instead of `max_tokens`; required for providers that reject both being set simultaneously (e.g. VolcEngine) | `True` |
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
## Model Presets
|
## Model Presets
|
||||||
|
|
||||||
@ -1122,8 +1083,8 @@ Configure transcription under the top-level `transcription` section:
|
|||||||
| Setting | Default | Description |
|
| Setting | Default | Description |
|
||||||
|---------|---------|-------------|
|
|---------|---------|-------------|
|
||||||
| `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. |
|
| `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. |
|
||||||
| `provider` | `"groq"` | Transcription backend: `"groq"`, `"openai"`, `"openrouter"`, or `"xiaomi_mimo"`. |
|
| `provider` | `"groq"` | Transcription backend: `"groq"`, `"openai"`, `"openrouter"`, `"xiaomi_mimo"`, or `"assemblyai"`. |
|
||||||
| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq, `whisper-1` for OpenAI, `openai/whisper-1` for OpenRouter, and `mimo-v2.5-asr` for Xiaomi MiMo ASR. OpenRouter accepts only speech-to-text models on its transcription endpoint, such as `nvidia/parakeet-tdt-0.6b-v3`, `openai/whisper-1`, or `openai/gpt-4o-transcribe`; chat LLMs are rejected there. |
|
| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq, `whisper-1` for OpenAI, `openai/whisper-1` for OpenRouter, `mimo-v2.5-asr` for Xiaomi MiMo ASR, and `universal-3-pro,universal-2` for AssemblyAI. OpenRouter accepts only speech-to-text models on its transcription endpoint, such as `nvidia/parakeet-tdt-0.6b-v3`, `openai/whisper-1`, or `openai/gpt-4o-transcribe`; chat LLMs are rejected there. AssemblyAI accepts a comma-separated model fallback list. |
|
||||||
| `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. |
|
| `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. |
|
||||||
| `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration. |
|
| `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration. |
|
||||||
| `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size. |
|
| `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size. |
|
||||||
@ -1155,6 +1116,9 @@ Transcription credentials are intentionally not stored in `transcription`. Put t
|
|||||||
|
|
||||||
Selecting a transcription provider does not configure credentials by itself. For example, the effective provider may default to Groq for compatibility, but transcription is only usable when `providers.groq.apiKey` or the matching environment-backed config is available. The Settings UI writes only the top-level `transcription` fields.
|
Selecting a transcription provider does not configure credentials by itself. For example, the effective provider may default to Groq for compatibility, but transcription is only usable when `providers.groq.apiKey` or the matching environment-backed config is available. The Settings UI writes only the top-level `transcription` fields.
|
||||||
|
|
||||||
|
If you are adding a new transcription provider, see
|
||||||
|
[`development.md`](./development.md#adding-a-transcription-provider).
|
||||||
|
|
||||||
## Channel Settings
|
## Channel Settings
|
||||||
|
|
||||||
Global settings that apply to all channels. Configure under the `channels` section in `~/.nanobot/config.json`:
|
Global settings that apply to all channels. Configure under the `channels` section in `~/.nanobot/config.json`:
|
||||||
|
|||||||
132
docs/development.md
Normal file
132
docs/development.md
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
# Development
|
||||||
|
|
||||||
|
This page collects contributor-facing notes for extending nanobot. User-facing setup
|
||||||
|
and runtime options live in [`configuration.md`](./configuration.md).
|
||||||
|
|
||||||
|
## Adding an LLM Provider
|
||||||
|
|
||||||
|
nanobot uses the provider registry in `nanobot/providers/registry.py` as the
|
||||||
|
source of truth for LLM provider metadata. Most OpenAI-compatible providers need
|
||||||
|
only two changes.
|
||||||
|
|
||||||
|
1. Add a `ProviderSpec` entry to `PROVIDERS`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
ProviderSpec(
|
||||||
|
name="myprovider",
|
||||||
|
keywords=("myprovider", "mymodel"),
|
||||||
|
env_key="MYPROVIDER_API_KEY",
|
||||||
|
display_name="My Provider",
|
||||||
|
default_api_base="https://api.myprovider.com/v1",
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Add a field to `ProvidersConfig` in `nanobot/config/schema.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class ProvidersConfig(BaseModel):
|
||||||
|
...
|
||||||
|
myprovider: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
|
```
|
||||||
|
|
||||||
|
Environment variables, config matching, provider status, and WebUI credential
|
||||||
|
display derive from those two entries.
|
||||||
|
|
||||||
|
Useful `ProviderSpec` options:
|
||||||
|
|
||||||
|
| Field | Description |
|
||||||
|
|---|---|
|
||||||
|
| `default_api_base` | Default OpenAI-compatible base URL. |
|
||||||
|
| `env_extras` | Additional environment variables derived from the provider config. |
|
||||||
|
| `model_overrides` | Per-model request parameter overrides. |
|
||||||
|
| `is_gateway` | Provider can route many model families, like OpenRouter. |
|
||||||
|
| `detect_by_key_prefix` | Match configured gateways by API-key prefix. |
|
||||||
|
| `detect_by_base_keyword` | Match configured gateways by API base URL. |
|
||||||
|
| `strip_model_prefix` | Strip `provider/` before sending the model to the upstream API. |
|
||||||
|
| `supports_max_completion_tokens` | Use `max_completion_tokens` instead of `max_tokens`. |
|
||||||
|
| `is_transcription_only` | Provider has credentials but cannot serve chat completions. |
|
||||||
|
|
||||||
|
## Adding a Transcription Provider
|
||||||
|
|
||||||
|
Transcription is intentionally split into two layers:
|
||||||
|
|
||||||
|
- `nanobot/audio/transcription_registry.py` owns provider names, aliases, default
|
||||||
|
models, and adapter loading.
|
||||||
|
- `nanobot/providers/transcription.py` owns provider-specific HTTP behavior.
|
||||||
|
|
||||||
|
Credentials still live under `providers.<provider>` so chat channels, WebUI, and
|
||||||
|
desktop resolve API keys and API bases the same way.
|
||||||
|
|
||||||
|
1. Add a credentials field for the provider to `ProvidersConfig` in `nanobot/config/schema.py`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class ProvidersConfig(BaseModel):
|
||||||
|
...
|
||||||
|
my_stt: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Add a `ProviderSpec` in `nanobot/providers/registry.py`.
|
||||||
|
|
||||||
|
For transcription-only providers, set `is_transcription_only=True` so they show up
|
||||||
|
in credential/settings surfaces but stay out of chat model selection.
|
||||||
|
|
||||||
|
```python
|
||||||
|
ProviderSpec(
|
||||||
|
name="my_stt",
|
||||||
|
keywords=("my_stt",),
|
||||||
|
env_key="MY_STT_API_KEY",
|
||||||
|
display_name="My STT",
|
||||||
|
default_api_base="https://api.example.com/v1",
|
||||||
|
is_transcription_only=True,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Add an adapter class in `nanobot/providers/transcription.py`.
|
||||||
|
|
||||||
|
Adapters receive resolved credentials and settings. They should return an empty string
|
||||||
|
for provider errors so channel voice messages fail quietly instead of crashing the
|
||||||
|
agent loop.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class MySTTTranscriptionProvider:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
api_key: str | None = None,
|
||||||
|
api_base: str | None = None,
|
||||||
|
language: str | None = None,
|
||||||
|
model: str | None = None,
|
||||||
|
):
|
||||||
|
self.api_key = api_key or os.environ.get("MY_STT_API_KEY")
|
||||||
|
self.api_base = api_base or "https://api.example.com/v1"
|
||||||
|
self.language = language or None
|
||||||
|
self.model = model or "my-default-stt-model"
|
||||||
|
|
||||||
|
async def transcribe(self, file_path: str | Path) -> str:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Register the adapter in `nanobot/audio/transcription_registry.py`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
TranscriptionProviderSpec(
|
||||||
|
name="my_stt",
|
||||||
|
default_model="my-default-stt-model",
|
||||||
|
adapter="nanobot.providers.transcription:MySTTTranscriptionProvider",
|
||||||
|
aliases=("mystt",),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Add tests.
|
||||||
|
|
||||||
|
At minimum, cover:
|
||||||
|
|
||||||
|
- config resolution in `tests/providers/test_transcription.py`
|
||||||
|
- adapter request/response behavior and retry/error handling
|
||||||
|
- WebUI settings payload/update behavior in `tests/webui/test_settings_api.py`
|
||||||
|
- provider brand mapping if the provider appears in Settings
|
||||||
|
|
||||||
|
6. Update user-facing docs.
|
||||||
|
|
||||||
|
Add the provider to [`configuration.md`](./configuration.md) where users choose
|
||||||
|
`transcription.provider`, but keep implementation details in this development
|
||||||
|
guide.
|
||||||
@ -11,26 +11,20 @@ from __future__ import annotations
|
|||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Literal
|
from typing import Any
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
|
from nanobot.audio.transcription_registry import (
|
||||||
|
get_transcription_provider,
|
||||||
|
resolve_transcription_provider,
|
||||||
|
)
|
||||||
from nanobot.config.paths import get_media_dir
|
from nanobot.config.paths import get_media_dir
|
||||||
from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url
|
from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url
|
||||||
|
|
||||||
TranscriptionProviderName = Literal["groq", "openai", "openrouter", "xiaomi_mimo"]
|
TranscriptionProviderName = str
|
||||||
|
|
||||||
_DEFAULT_PROVIDER: TranscriptionProviderName = "groq"
|
_DEFAULT_PROVIDER: TranscriptionProviderName = "groq"
|
||||||
_DEFAULT_MODELS: dict[TranscriptionProviderName, str] = {
|
|
||||||
"groq": "whisper-large-v3",
|
|
||||||
"openai": "whisper-1",
|
|
||||||
"openrouter": "openai/whisper-1",
|
|
||||||
"xiaomi_mimo": "mimo-v2.5-asr",
|
|
||||||
}
|
|
||||||
_PROVIDER_ALIASES: dict[str, TranscriptionProviderName] = {
|
|
||||||
"mimo": "xiaomi_mimo",
|
|
||||||
"xiaomi": "xiaomi_mimo",
|
|
||||||
}
|
|
||||||
_MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024
|
_MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024
|
||||||
_AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({
|
_AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({
|
||||||
"audio/aac",
|
"audio/aac",
|
||||||
@ -72,13 +66,8 @@ class TranscriptionIngressError(Exception):
|
|||||||
|
|
||||||
|
|
||||||
def _as_provider(value: Any) -> TranscriptionProviderName | None:
|
def _as_provider(value: Any) -> TranscriptionProviderName | None:
|
||||||
if isinstance(value, str):
|
spec = resolve_transcription_provider(value)
|
||||||
name = value.strip().lower()
|
return spec.name if spec else None
|
||||||
if name in _PROVIDER_ALIASES:
|
|
||||||
return _PROVIDER_ALIASES[name]
|
|
||||||
if name in _DEFAULT_MODELS:
|
|
||||||
return name # type: ignore[return-value]
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _provider_config(config: Any, provider: str) -> Any:
|
def _provider_config(config: Any, provider: str) -> Any:
|
||||||
@ -101,11 +90,17 @@ def resolve_transcription_config(config: Any) -> EffectiveTranscriptionConfig:
|
|||||||
or _as_provider(getattr(channels, "transcription_provider", None))
|
or _as_provider(getattr(channels, "transcription_provider", None))
|
||||||
or _DEFAULT_PROVIDER
|
or _DEFAULT_PROVIDER
|
||||||
)
|
)
|
||||||
|
spec = get_transcription_provider(provider)
|
||||||
|
if spec is None:
|
||||||
|
logger.warning("Unknown transcription provider {}; falling back to {}", provider, _DEFAULT_PROVIDER)
|
||||||
|
provider = _DEFAULT_PROVIDER
|
||||||
|
spec = get_transcription_provider(provider)
|
||||||
|
default_model = spec.default_model if spec else ""
|
||||||
provider_cfg = _provider_config(config, provider)
|
provider_cfg = _provider_config(config, provider)
|
||||||
return EffectiveTranscriptionConfig(
|
return EffectiveTranscriptionConfig(
|
||||||
enabled=bool(getattr(top, "enabled", True)),
|
enabled=bool(getattr(top, "enabled", True)),
|
||||||
provider=provider,
|
provider=provider,
|
||||||
model=(getattr(top, "model", None) or _DEFAULT_MODELS[provider]).strip(),
|
model=(getattr(top, "model", None) or default_model).strip(),
|
||||||
language=getattr(top, "language", None) or getattr(channels, "transcription_language", None),
|
language=getattr(top, "language", None) or getattr(channels, "transcription_language", None),
|
||||||
api_key=getattr(provider_cfg, "api_key", None) or "",
|
api_key=getattr(provider_cfg, "api_key", None) or "",
|
||||||
api_base=getattr(provider_cfg, "api_base", None) or "",
|
api_base=getattr(provider_cfg, "api_base", None) or "",
|
||||||
@ -170,40 +165,14 @@ async def transcribe_audio_file(
|
|||||||
"""Transcribe *file_path* using the already-resolved transcription config."""
|
"""Transcribe *file_path* using the already-resolved transcription config."""
|
||||||
if not config.enabled or not config.configured:
|
if not config.enabled or not config.configured:
|
||||||
return ""
|
return ""
|
||||||
if config.provider == "openai":
|
spec = get_transcription_provider(config.provider)
|
||||||
from nanobot.providers.transcription import OpenAITranscriptionProvider
|
if spec is None:
|
||||||
|
logger.warning("Unknown transcription provider: {}", config.provider)
|
||||||
provider = OpenAITranscriptionProvider(
|
return ""
|
||||||
api_key=config.api_key,
|
provider = spec.load_adapter()(
|
||||||
api_base=config.api_base or None,
|
api_key=config.api_key,
|
||||||
language=config.language,
|
api_base=config.api_base or None,
|
||||||
model=config.model,
|
language=config.language,
|
||||||
)
|
model=config.model,
|
||||||
elif config.provider == "openrouter":
|
)
|
||||||
from nanobot.providers.transcription import OpenRouterTranscriptionProvider
|
|
||||||
|
|
||||||
provider = OpenRouterTranscriptionProvider(
|
|
||||||
api_key=config.api_key,
|
|
||||||
api_base=config.api_base or None,
|
|
||||||
language=config.language,
|
|
||||||
model=config.model,
|
|
||||||
)
|
|
||||||
elif config.provider == "xiaomi_mimo":
|
|
||||||
from nanobot.providers.transcription import XiaomiMiMoTranscriptionProvider
|
|
||||||
|
|
||||||
provider = XiaomiMiMoTranscriptionProvider(
|
|
||||||
api_key=config.api_key,
|
|
||||||
api_base=config.api_base or None,
|
|
||||||
language=config.language,
|
|
||||||
model=config.model,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
from nanobot.providers.transcription import GroqTranscriptionProvider
|
|
||||||
|
|
||||||
provider = GroqTranscriptionProvider(
|
|
||||||
api_key=config.api_key,
|
|
||||||
api_base=config.api_base or None,
|
|
||||||
language=config.language,
|
|
||||||
model=config.model,
|
|
||||||
)
|
|
||||||
return await provider.transcribe(file_path)
|
return await provider.transcribe(file_path)
|
||||||
|
|||||||
90
nanobot/audio/transcription_registry.py
Normal file
90
nanobot/audio/transcription_registry.py
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
"""Registry for speech-to-text providers.
|
||||||
|
|
||||||
|
Provider-specific HTTP adapters live in ``nanobot.providers.transcription``.
|
||||||
|
This module is the app-level source of truth for provider names, aliases,
|
||||||
|
default models, and adapter class paths.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from importlib import import_module
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Protocol
|
||||||
|
|
||||||
|
|
||||||
|
class TranscriptionProviderAdapter(Protocol):
    """Structural interface that provider-specific transcription adapters satisfy.

    An adapter is constructed with keyword arguments for credentials and
    settings and exposes a single coroutine, ``transcribe``.
    """

    def __init__(
        self,
        api_key: str | None = None,
        api_base: str | None = None,
        language: str | None = None,
        model: str | None = None,
    ) -> None:
        """Accept the API key, API base, language hint, and model name (all optional)."""
        ...

    async def transcribe(self, file_path: str | Path) -> str:
        """Transcribe the audio file at *file_path* and return the resulting text."""
        ...
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TranscriptionProviderSpec:
    """Immutable registry entry describing one speech-to-text provider."""

    # Provider name used as the registry key.
    name: str
    # Model identifier recorded as this provider's default.
    default_model: str
    # Import path of the adapter, written as "package.module:AttributeName".
    adapter: str
    # Additional names that map to this provider.
    aliases: tuple[str, ...] = ()

    def load_adapter(self) -> type[TranscriptionProviderAdapter]:
        """Import and return the adapter named by ``self.adapter``.

        The adapter module is imported lazily, on each call, rather than when
        the registry itself is imported.

        Returns:
            The attribute named after the ``:`` in ``self.adapter``.

        Raises:
            RuntimeError: If ``self.adapter`` is not of the form
                ``"module:name"``, or if the resolved attribute is not
                callable and therefore cannot be instantiated as an adapter.
            ImportError: If the module part cannot be imported.
            AttributeError: If the module has no attribute with that name.
        """
        module_name, _, class_name = self.adapter.partition(":")
        if not module_name or not class_name:
            raise RuntimeError(f"Invalid transcription adapter path: {self.adapter}")
        adapter = getattr(import_module(module_name), class_name)
        # Fail here with a clear message instead of an obscure TypeError when
        # the caller later tries to instantiate a non-callable attribute.
        if not callable(adapter):
            raise RuntimeError(
                f"Transcription adapter is not callable: {self.adapter}"
            )
        return adapter
|
||||||
|
|
||||||
|
|
||||||
|
# Registered speech-to-text providers, in registration order.
TRANSCRIPTION_PROVIDERS: tuple[TranscriptionProviderSpec, ...] = (
    TranscriptionProviderSpec(
        name="groq",
        default_model="whisper-large-v3",
        adapter="nanobot.providers.transcription:GroqTranscriptionProvider",
    ),
    TranscriptionProviderSpec(
        name="openai",
        default_model="whisper-1",
        adapter="nanobot.providers.transcription:OpenAITranscriptionProvider",
    ),
    TranscriptionProviderSpec(
        name="openrouter",
        default_model="openai/whisper-1",
        adapter="nanobot.providers.transcription:OpenRouterTranscriptionProvider",
    ),
    TranscriptionProviderSpec(
        name="xiaomi_mimo",
        default_model="mimo-v2.5-asr",
        adapter="nanobot.providers.transcription:XiaomiMiMoTranscriptionProvider",
        aliases=("mimo", "xiaomi"),
    ),
    TranscriptionProviderSpec(
        name="assemblyai",
        default_model="universal-3-pro,universal-2",
        adapter="nanobot.providers.transcription:AssemblyAITranscriptionProvider",
    ),
)

# Lookup tables derived from the tuple above: one keyed by provider name,
# one keyed by each alias a spec declares.
_BY_NAME = dict((entry.name, entry) for entry in TRANSCRIPTION_PROVIDERS)
_BY_ALIAS = dict(
    (alias, entry) for entry in TRANSCRIPTION_PROVIDERS for alias in entry.aliases
)
|
||||||
|
|
||||||
|
|
||||||
|
def transcription_provider_names() -> tuple[str, ...]:
    """Return the names of all registered providers, in registration order."""
    names = [entry.name for entry in TRANSCRIPTION_PROVIDERS]
    return tuple(names)
|
||||||
|
|
||||||
|
|
||||||
|
def get_transcription_provider(name: str) -> TranscriptionProviderSpec | None:
    """Look up a provider spec by its exact registered name.

    The lookup does no normalization and does not consult aliases; an
    unregistered name yields ``None``.
    """
    found = _BY_NAME.get(name)
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_transcription_provider(value: Any) -> TranscriptionProviderSpec | None:
    """Resolve a loosely formatted provider value to its spec.

    String values are stripped and lower-cased, then matched against
    registered names first and aliases second. Non-string values and
    unmatched strings yield ``None``.
    """
    if isinstance(value, str):
        key = value.strip().lower()
        return _BY_NAME.get(key) or _BY_ALIAS.get(key)
    return None
|
||||||
@ -47,7 +47,7 @@ class TranscriptionConfig(Base):
|
|||||||
"""Cross-channel audio transcription configuration."""
|
"""Cross-channel audio transcription configuration."""
|
||||||
|
|
||||||
enabled: bool = True
|
enabled: bool = True
|
||||||
provider: Literal["groq", "openai", "openrouter", "xiaomi_mimo"] | None = None
|
provider: str | None = None # Validated by nanobot.audio.transcription_registry.
|
||||||
model: str | None = None
|
model: str | None = None
|
||||||
language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")
|
language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")
|
||||||
max_duration_sec: int = Field(default=120, ge=1, le=600)
|
max_duration_sec: int = Field(default=120, ge=1, le=600)
|
||||||
@ -202,6 +202,7 @@ class ProvidersConfig(Base):
|
|||||||
anthropic: ProviderConfig = Field(default_factory=ProviderConfig)
|
anthropic: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
openai: ProviderConfig = Field(default_factory=ProviderConfig)
|
openai: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
openrouter: ProviderConfig = Field(default_factory=ProviderConfig)
|
openrouter: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
|
assemblyai: ProviderConfig = Field(default_factory=ProviderConfig) # AssemblyAI voice transcription
|
||||||
huggingface: ProviderConfig = Field(default_factory=ProviderConfig)
|
huggingface: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
skywork: ProviderConfig = Field(default_factory=ProviderConfig) # Skywork / APIFree API gateway
|
skywork: ProviderConfig = Field(default_factory=ProviderConfig) # Skywork / APIFree API gateway
|
||||||
deepseek: ProviderConfig = Field(default_factory=ProviderConfig)
|
deepseek: ProviderConfig = Field(default_factory=ProviderConfig)
|
||||||
@ -402,6 +403,8 @@ class Config(BaseSettings):
|
|||||||
|
|
||||||
# Explicit provider prefix wins — prevents `github-copilot/...codex` matching openai_codex.
|
# Explicit provider prefix wins — prevents `github-copilot/...codex` matching openai_codex.
|
||||||
for spec in PROVIDERS:
|
for spec in PROVIDERS:
|
||||||
|
if spec.is_transcription_only:
|
||||||
|
continue
|
||||||
p = getattr(self.providers, spec.name, None)
|
p = getattr(self.providers, spec.name, None)
|
||||||
if p and model_prefix and normalized_prefix == spec.name:
|
if p and model_prefix and normalized_prefix == spec.name:
|
||||||
if spec.is_oauth or spec.is_local or spec.is_direct or p.api_key:
|
if spec.is_oauth or spec.is_local or spec.is_direct or p.api_key:
|
||||||
@ -409,6 +412,8 @@ class Config(BaseSettings):
|
|||||||
|
|
||||||
# Match by keyword (order follows PROVIDERS registry)
|
# Match by keyword (order follows PROVIDERS registry)
|
||||||
for spec in PROVIDERS:
|
for spec in PROVIDERS:
|
||||||
|
if spec.is_transcription_only:
|
||||||
|
continue
|
||||||
p = getattr(self.providers, spec.name, None)
|
p = getattr(self.providers, spec.name, None)
|
||||||
if p and any(_kw_matches(kw) for kw in spec.keywords):
|
if p and any(_kw_matches(kw) for kw in spec.keywords):
|
||||||
if spec.is_oauth or spec.is_local or spec.is_direct or p.api_key:
|
if spec.is_oauth or spec.is_local or spec.is_direct or p.api_key:
|
||||||
@ -435,7 +440,7 @@ class Config(BaseSettings):
|
|||||||
# Fallback: gateways first, then others (follows registry order)
|
# Fallback: gateways first, then others (follows registry order)
|
||||||
# OAuth providers are NOT valid fallbacks — they require explicit model selection
|
# OAuth providers are NOT valid fallbacks — they require explicit model selection
|
||||||
for spec in PROVIDERS:
|
for spec in PROVIDERS:
|
||||||
if spec.is_oauth:
|
if spec.is_oauth or spec.is_transcription_only:
|
||||||
continue
|
continue
|
||||||
p = getattr(self.providers, spec.name, None)
|
p = getattr(self.providers, spec.name, None)
|
||||||
if p and p.api_key:
|
if p and p.api_key:
|
||||||
|
|||||||
@ -41,6 +41,8 @@ def _make_provider_core(
|
|||||||
provider_name = config.get_provider_name(model, preset=resolved)
|
provider_name = config.get_provider_name(model, preset=resolved)
|
||||||
p = config.get_provider(model, preset=resolved)
|
p = config.get_provider(model, preset=resolved)
|
||||||
spec = find_by_name(provider_name) if provider_name else None
|
spec = find_by_name(provider_name) if provider_name else None
|
||||||
|
if spec and spec.is_transcription_only:
|
||||||
|
raise ValueError(f"Provider '{provider_name}' only supports transcription.")
|
||||||
backend = spec.backend if spec else "openai_compat"
|
backend = spec.backend if spec else "openai_compat"
|
||||||
|
|
||||||
if backend == "azure_openai":
|
if backend == "azure_openai":
|
||||||
|
|||||||
@ -60,6 +60,9 @@ class ProviderSpec:
|
|||||||
# Direct providers skip API-key validation (user supplies everything)
|
# Direct providers skip API-key validation (user supplies everything)
|
||||||
is_direct: bool = False
|
is_direct: bool = False
|
||||||
|
|
||||||
|
# Provider is listed for shared credentials but cannot serve chat completions.
|
||||||
|
is_transcription_only: bool = False
|
||||||
|
|
||||||
# Provider supports cache_control on content blocks (e.g. Anthropic prompt caching)
|
# Provider supports cache_control on content blocks (e.g. Anthropic prompt caching)
|
||||||
supports_prompt_caching: bool = False
|
supports_prompt_caching: bool = False
|
||||||
|
|
||||||
@ -507,6 +510,17 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
|
|||||||
backend="openai_compat",
|
backend="openai_compat",
|
||||||
default_api_base="https://api.groq.com/openai/v1",
|
default_api_base="https://api.groq.com/openai/v1",
|
||||||
),
|
),
|
||||||
|
# AssemblyAI: voice transcription only. It appears in provider settings so
|
||||||
|
# users can manage credentials, but WebUI excludes it from chat model pickers.
|
||||||
|
ProviderSpec(
|
||||||
|
name="assemblyai",
|
||||||
|
keywords=("assemblyai",),
|
||||||
|
env_key="ASSEMBLYAI_API_KEY",
|
||||||
|
display_name="AssemblyAI",
|
||||||
|
backend="openai_compat",
|
||||||
|
default_api_base="https://api.assemblyai.com/v2",
|
||||||
|
is_transcription_only=True,
|
||||||
|
),
|
||||||
# Qianfan (百度千帆): OpenAI-compatible API
|
# Qianfan (百度千帆): OpenAI-compatible API
|
||||||
ProviderSpec(
|
ProviderSpec(
|
||||||
name="qianfan",
|
name="qianfan",
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
"""Provider-specific voice transcription adapters.
|
"""Provider-specific voice transcription adapters.
|
||||||
|
|
||||||
This module only knows how to call external transcription APIs such as Groq,
|
This module only knows how to call external transcription APIs such as Groq,
|
||||||
OpenAI Whisper, OpenRouter, and Xiaomi MiMo ASR. Product-level config fallback,
|
OpenAI Whisper, OpenRouter, Xiaomi MiMo ASR, and AssemblyAI. Product-level config fallback,
|
||||||
WebUI upload validation, and channel integration live in
|
WebUI upload validation, and channel integration live in
|
||||||
``nanobot.audio.transcription``.
|
``nanobot.audio.transcription``.
|
||||||
"""
|
"""
|
||||||
@ -19,6 +19,9 @@ from loguru import logger
|
|||||||
|
|
||||||
_CHAT_COMPLETIONS_PATH = "chat/completions"
|
_CHAT_COMPLETIONS_PATH = "chat/completions"
|
||||||
_TRANSCRIPTIONS_PATH = "audio/transcriptions"
|
_TRANSCRIPTIONS_PATH = "audio/transcriptions"
|
||||||
|
_ASSEMBLYAI_DEFAULT_API_BASE = "https://api.assemblyai.com/v2"
|
||||||
|
_ASSEMBLYAI_POLL_ATTEMPTS = 60
|
||||||
|
_ASSEMBLYAI_POLL_INTERVAL_S = 2.0
|
||||||
_AUDIO_MIME_OVERRIDES = {
|
_AUDIO_MIME_OVERRIDES = {
|
||||||
".m4a": "audio/mp4",
|
".m4a": "audio/mp4",
|
||||||
".mpga": "audio/mpeg",
|
".mpga": "audio/mpeg",
|
||||||
@ -63,6 +66,11 @@ def _resolve_chat_completions_url(api_base: str | None, default_url: str) -> str
|
|||||||
return f"{base}/{_CHAT_COMPLETIONS_PATH}"
|
return f"{base}/{_CHAT_COMPLETIONS_PATH}"
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_api_path(api_base: str | None, default_base: str, path: str) -> str:
|
||||||
|
base = (api_base or default_base).rstrip("/")
|
||||||
|
return f"{base}/{path.lstrip('/')}"
|
||||||
|
|
||||||
|
|
||||||
def _audio_mime_type(path: Path) -> str:
|
def _audio_mime_type(path: Path) -> str:
|
||||||
return (
|
return (
|
||||||
_AUDIO_MIME_OVERRIDES.get(path.suffix.lower())
|
_AUDIO_MIME_OVERRIDES.get(path.suffix.lower())
|
||||||
@ -93,6 +101,90 @@ _RETRYABLE_EXCEPTIONS = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _request_json_with_retry(
    client: httpx.AsyncClient,
    method: str,
    url: str,
    *,
    provider_label: str,
    **kwargs: object,
) -> dict[str, Any] | None:
    """Send one HTTP request with retries and return its JSON object body.

    Args:
        client: Open ``httpx.AsyncClient`` used to send the request.
        method: HTTP verb, e.g. ``"POST"`` or ``"GET"``.
        url: Absolute request URL.
        provider_label: Provider name interpolated into log messages.
        **kwargs: Forwarded unchanged to the client call.

    Returns:
        The decoded JSON payload when the response succeeds and the body is a
        JSON object; ``None`` on any failure. Every failure is logged here,
        and nothing is raised to the caller.
    """
    total_attempts = _MAX_RETRIES + 1
    for attempt in range(total_attempts):
        retries_left = attempt < _MAX_RETRIES

        try:
            # Use the verb-specific client method (client.post / client.get)
            # when the client has one; otherwise fall back to client.request.
            verb_call = getattr(client, method.lower(), None)
            if verb_call is not None:
                response = await verb_call(url, **kwargs)
            else:
                response = await client.request(method, url, **kwargs)
        except _RETRYABLE_EXCEPTIONS as e:
            if not retries_left:
                logger.exception(
                    "{} transcription error after {} attempts: {}",
                    provider_label,
                    total_attempts,
                    e,
                )
                return None
            logger.warning(
                "{} transcription transient error (attempt {}/{}): {}",
                provider_label,
                attempt + 1,
                total_attempts,
                e,
            )
            await asyncio.sleep(_BACKOFF_S[attempt])
            continue
        except Exception as e:
            logger.exception("{} transcription error: {}", provider_label, e)
            return None

        # A retryable status on the final attempt is not retried; it falls
        # through to raise_for_status() below and is reported as an HTTP error.
        if retries_left and response.status_code in _RETRYABLE_STATUS:
            logger.warning(
                "{} transcription transient HTTP {} (attempt {}/{})",
                provider_label,
                response.status_code,
                attempt + 1,
                total_attempts,
            )
            await asyncio.sleep(_BACKOFF_S[attempt])
            continue

        try:
            response.raise_for_status()
        except httpx.HTTPStatusError:
            # Flatten and truncate the body so the log line stays bounded.
            body = response.text.strip().replace("\n", " ")[:500]
            reason = f" {response.reason_phrase}" if response.reason_phrase else ""
            detail = f": {body}" if body else ""
            logger.error(
                "{} transcription HTTP {}{}{}",
                provider_label,
                response.status_code,
                reason,
                detail,
            )
            return None
        except Exception as e:
            logger.exception("{} transcription error: {}", provider_label, e)
            return None

        try:
            payload = response.json()
        except Exception as e:
            logger.exception(
                "{} transcription error: malformed response body: {}",
                provider_label,
                e,
            )
            return None

        if isinstance(payload, dict):
            return payload
        logger.error(
            "{} transcription error: unexpected response shape: {!r}",
            provider_label,
            type(payload).__name__,
        )
        return None

    return None
|
||||||
|
|
||||||
|
|
||||||
async def _post_transcription_with_retry(
|
async def _post_transcription_with_retry(
|
||||||
url: str,
|
url: str,
|
||||||
*,
|
*,
|
||||||
@ -305,6 +397,107 @@ def _text_from_chat_payload(payload: dict[str, Any]) -> str:
|
|||||||
return text if isinstance(text, str) else ""
|
return text if isinstance(text, str) else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _assemblyai_speech_models(model: str | None) -> list[str]:
|
||||||
|
return [part for part in (part.strip() for part in (model or "").split(",")) if part]
|
||||||
|
|
||||||
|
|
||||||
|
class AssemblyAITranscriptionProvider:
    """Voice transcription provider using AssemblyAI's asynchronous REST API.

    A transcription takes three steps: upload the raw audio bytes, create a
    transcript job that points at the uploaded audio, then poll that job until
    it reports a terminal status.
    """

    def __init__(
        self,
        api_key: str | None = None,
        api_base: str | None = None,
        language: str | None = None,
        model: str | None = None,
    ):
        """Resolve credentials, endpoints, and request defaults.

        Args:
            api_key: API key; falls back to the ``ASSEMBLYAI_API_KEY``
                environment variable.
            api_base: API base URL; falls back to ``ASSEMBLYAI_BASE_URL`` and
                then to the module default base.
            language: Optional language code sent as ``language_code``.
            model: Comma-separated speech model names; defaults to
                ``"universal-3-pro,universal-2"``.
        """
        base = api_base or os.environ.get("ASSEMBLYAI_BASE_URL")
        self.api_key = api_key or os.environ.get("ASSEMBLYAI_API_KEY")
        self.upload_url = _resolve_api_path(base, _ASSEMBLYAI_DEFAULT_API_BASE, "upload")
        self.transcript_url = _resolve_api_path(base, _ASSEMBLYAI_DEFAULT_API_BASE, "transcript")
        # Normalize an empty string to None so the request omits language_code.
        self.language = language or None
        self.model = model or "universal-3-pro,universal-2"
        logger.debug("AssemblyAI transcription endpoint: {}", self.transcript_url)

    async def transcribe(self, file_path: str | Path) -> str:
        """Transcribe an audio file and return its text.

        Args:
            file_path: Path to the audio file to upload.

        Returns:
            The transcript text, or ``""`` when the API key is missing, the
            file is missing or unreadable, any HTTP step fails, the transcript
            job reports an error, or polling runs out of attempts. Failures
            are logged and never raised.
        """
        if not self.api_key:
            logger.warning("AssemblyAI API key not configured for transcription")
            return ""
        path = Path(file_path)
        if not path.exists():
            logger.error("Audio file not found: {}", file_path)
            return ""
        try:
            # Read the file in a worker thread: a synchronous read inside this
            # coroutine would block the event loop for the whole read.
            data = await asyncio.to_thread(path.read_bytes)
        except OSError as e:
            logger.exception("AssemblyAI transcription error: cannot read audio file: {}", e)
            return ""

        # The API key is sent as the bare Authorization header value.
        headers = {"Authorization": self.api_key}
        async with httpx.AsyncClient() as client:
            # Step 1: upload the raw audio bytes.
            upload = await _request_json_with_retry(
                client,
                "POST",
                self.upload_url,
                provider_label="AssemblyAI",
                headers={**headers, "Content-Type": "application/octet-stream"},
                content=data,
                timeout=60.0,
            )
            upload_url = upload.get("upload_url") if upload else None
            if not isinstance(upload_url, str) or not upload_url:
                logger.error("AssemblyAI transcription error: upload_url missing")
                return ""

            # Step 2: create the transcript job for the uploaded audio.
            body: dict[str, object] = {"audio_url": upload_url}
            speech_models = _assemblyai_speech_models(self.model)
            if speech_models:
                body["speech_models"] = speech_models
            if self.language:
                body["language_code"] = self.language

            transcript = await _request_json_with_retry(
                client,
                "POST",
                self.transcript_url,
                provider_label="AssemblyAI",
                headers=headers,
                json=body,
                timeout=30.0,
            )
            transcript_id = transcript.get("id") if transcript else None
            if not isinstance(transcript_id, str) or not transcript_id:
                logger.error("AssemblyAI transcription error: transcript id missing")
                return ""

            # Step 3: poll until a terminal status or the attempt budget ends.
            poll_url = f"{self.transcript_url.rstrip('/')}/{transcript_id}"
            for attempt in range(_ASSEMBLYAI_POLL_ATTEMPTS):
                payload = await _request_json_with_retry(
                    client,
                    "GET",
                    poll_url,
                    provider_label="AssemblyAI",
                    headers=headers,
                    timeout=30.0,
                )
                if not payload:
                    # The helper has already logged the failure.
                    return ""
                status = str(payload.get("status") or "").lower()
                if status == "completed":
                    text = payload.get("text")
                    return text if isinstance(text, str) else ""
                if status in {"error", "failed"}:
                    logger.error(
                        "AssemblyAI transcription failed: {}",
                        payload.get("error") or payload,
                    )
                    return ""
                # Any other status keeps polling; skip the sleep after the
                # final attempt so the timeout is reported immediately.
                if attempt < _ASSEMBLYAI_POLL_ATTEMPTS - 1:
                    await asyncio.sleep(_ASSEMBLYAI_POLL_INTERVAL_S)
            logger.error("AssemblyAI transcription timed out while polling transcript")
            return ""
|
||||||
|
|
||||||
|
|
||||||
class OpenAITranscriptionProvider:
|
class OpenAITranscriptionProvider:
|
||||||
"""Voice transcription provider using OpenAI's Whisper API."""
|
"""Voice transcription provider using OpenAI's Whisper API."""
|
||||||
|
|
||||||
|
|||||||
@ -16,6 +16,10 @@ from zoneinfo import ZoneInfo
|
|||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
from nanobot.audio.transcription import resolve_transcription_config
|
from nanobot.audio.transcription import resolve_transcription_config
|
||||||
|
from nanobot.audio.transcription_registry import (
|
||||||
|
resolve_transcription_provider,
|
||||||
|
transcription_provider_names,
|
||||||
|
)
|
||||||
from nanobot.config.loader import get_config_path, load_config, save_config
|
from nanobot.config.loader import get_config_path, load_config, save_config
|
||||||
from nanobot.config.schema import ModelPresetConfig
|
from nanobot.config.schema import ModelPresetConfig
|
||||||
from nanobot.providers.image_generation import (
|
from nanobot.providers.image_generation import (
|
||||||
@ -91,7 +95,6 @@ _IMAGE_GENERATION_ASPECT_RATIOS = {
|
|||||||
"2:3",
|
"2:3",
|
||||||
"21:9",
|
"21:9",
|
||||||
}
|
}
|
||||||
_TRANSCRIPTION_PROVIDERS = ("groq", "openai", "openrouter", "xiaomi_mimo")
|
|
||||||
_CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144}
|
_CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144}
|
||||||
_MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+")
|
_MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+")
|
||||||
_ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
|
_ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
|
||||||
@ -424,9 +427,13 @@ def provider_models_payload(query: QueryParams) -> dict[str, Any]:
|
|||||||
"fetched_at": time.time(),
|
"fetched_at": time.time(),
|
||||||
}
|
}
|
||||||
if (
|
if (
|
||||||
spec.backend in _MODEL_LIST_UNSUPPORTED_BACKENDS
|
spec.is_transcription_only
|
||||||
and spec.name != "minimax_anthropic"
|
or (
|
||||||
) or spec.is_oauth:
|
spec.backend in _MODEL_LIST_UNSUPPORTED_BACKENDS
|
||||||
|
and spec.name != "minimax_anthropic"
|
||||||
|
)
|
||||||
|
or spec.is_oauth
|
||||||
|
):
|
||||||
return {
|
return {
|
||||||
**base_payload,
|
**base_payload,
|
||||||
"status": "unsupported",
|
"status": "unsupported",
|
||||||
@ -542,6 +549,8 @@ def _validate_configured_provider(config: Any, provider: str) -> None:
|
|||||||
spec = find_by_name(provider)
|
spec = find_by_name(provider)
|
||||||
if spec is None:
|
if spec is None:
|
||||||
raise WebUISettingsError("unknown provider")
|
raise WebUISettingsError("unknown provider")
|
||||||
|
if spec.is_transcription_only:
|
||||||
|
raise WebUISettingsError("provider does not support chat models")
|
||||||
provider_config = getattr(config.providers, provider, None)
|
provider_config = getattr(config.providers, provider, None)
|
||||||
if (
|
if (
|
||||||
provider_config is None
|
provider_config is None
|
||||||
@ -580,7 +589,7 @@ def _image_generation_provider_rows(config: Any) -> list[dict[str, Any]]:
|
|||||||
|
|
||||||
def _transcription_provider_rows(config: Any) -> list[dict[str, Any]]:
|
def _transcription_provider_rows(config: Any) -> list[dict[str, Any]]:
|
||||||
rows: list[dict[str, Any]] = []
|
rows: list[dict[str, Any]] = []
|
||||||
for name in _TRANSCRIPTION_PROVIDERS:
|
for name in transcription_provider_names():
|
||||||
spec = find_by_name(name)
|
spec = find_by_name(name)
|
||||||
provider_config = getattr(config.providers, name, None)
|
provider_config = getattr(config.providers, name, None)
|
||||||
rows.append({
|
rows.append({
|
||||||
@ -640,6 +649,7 @@ def settings_payload(
|
|||||||
"api_key_hint": _mask_secret_hint(provider_config.api_key),
|
"api_key_hint": _mask_secret_hint(provider_config.api_key),
|
||||||
"api_base": provider_config.api_base,
|
"api_base": provider_config.api_base,
|
||||||
"default_api_base": spec.default_api_base or None,
|
"default_api_base": spec.default_api_base or None,
|
||||||
|
"model_selectable": not spec.is_transcription_only,
|
||||||
}
|
}
|
||||||
if oauth_status is not None:
|
if oauth_status is not None:
|
||||||
row["oauth_account"] = oauth_status["account"]
|
row["oauth_account"] = oauth_status["account"]
|
||||||
@ -1357,10 +1367,12 @@ def update_transcription_settings(query: QueryParams) -> dict[str, Any]:
|
|||||||
provider = _query_first(query, "provider")
|
provider = _query_first(query, "provider")
|
||||||
if provider is not None:
|
if provider is not None:
|
||||||
provider = provider.strip().lower()
|
provider = provider.strip().lower()
|
||||||
if provider not in _TRANSCRIPTION_PROVIDERS:
|
provider_spec = resolve_transcription_provider(provider)
|
||||||
|
if provider_spec is None:
|
||||||
raise WebUISettingsError("unknown transcription provider")
|
raise WebUISettingsError("unknown transcription provider")
|
||||||
|
provider = provider_spec.name
|
||||||
if transcription.provider != provider:
|
if transcription.provider != provider:
|
||||||
transcription.provider = provider # type: ignore[assignment]
|
transcription.provider = provider
|
||||||
changed = True
|
changed = True
|
||||||
|
|
||||||
model = _query_first(query, "model")
|
model = _query_first(query, "model")
|
||||||
|
|||||||
@ -245,3 +245,18 @@ def test_match_provider_routes_forced_novita_model_api_models() -> None:
|
|||||||
|
|
||||||
assert config.get_provider_name() == "novita"
|
assert config.get_provider_name() == "novita"
|
||||||
assert config.get_api_base() == "https://api.novita.ai/openai"
|
assert config.get_api_base() == "https://api.novita.ai/openai"
|
||||||
|
|
||||||
|
|
||||||
|
def test_transcription_only_provider_is_not_chat_fallback() -> None:
    """A transcription-only provider never resolves as the chat provider.

    AssemblyAI is the only configured provider and the default model carries
    its prefix, yet no chat provider name may be returned.
    """
    raw_config = {
        "providers": {"assemblyai": {"apiKey": "aai-test"}},
        "agents": {"defaults": {"model": "assemblyai/universal-3-pro"}},
    }
    config = Config.model_validate(raw_config)

    assert config.get_provider_name() is None
|
||||||
|
|||||||
@ -14,8 +14,14 @@ from nanobot.audio.transcription import (
|
|||||||
resolve_transcription_config,
|
resolve_transcription_config,
|
||||||
transcribe_audio_file,
|
transcribe_audio_file,
|
||||||
)
|
)
|
||||||
|
from nanobot.audio.transcription_registry import (
|
||||||
|
get_transcription_provider,
|
||||||
|
resolve_transcription_provider,
|
||||||
|
transcription_provider_names,
|
||||||
|
)
|
||||||
from nanobot.config.schema import Config
|
from nanobot.config.schema import Config
|
||||||
from nanobot.providers.transcription import (
|
from nanobot.providers.transcription import (
|
||||||
|
AssemblyAITranscriptionProvider,
|
||||||
GroqTranscriptionProvider,
|
GroqTranscriptionProvider,
|
||||||
OpenAITranscriptionProvider,
|
OpenAITranscriptionProvider,
|
||||||
OpenRouterTranscriptionProvider,
|
OpenRouterTranscriptionProvider,
|
||||||
@ -44,6 +50,17 @@ def _raw_response(status: int, content: bytes) -> httpx.Response:
|
|||||||
return httpx.Response(status_code=status, content=content, request=request)
|
return httpx.Response(status_code=status, content=content, request=request)
|
||||||
|
|
||||||
|
|
||||||
|
def _json_response(
    status: int,
    payload: dict[str, object],
    *,
    method: str = "POST",
    url: str = "https://example.test/audio/transcriptions",
) -> httpx.Response:
    """Build an ``httpx.Response`` with a JSON body and an attached request."""
    return httpx.Response(
        status_code=status,
        json=payload,
        request=httpx.Request(method, url),
    )
|
||||||
|
|
||||||
|
|
||||||
def test_resolver_uses_legacy_channel_provider_when_top_level_is_unset() -> None:
|
def test_resolver_uses_legacy_channel_provider_when_top_level_is_unset() -> None:
|
||||||
config = Config()
|
config = Config()
|
||||||
config.channels.transcription_provider = "openai"
|
config.channels.transcription_provider = "openai"
|
||||||
@ -128,6 +145,29 @@ def test_resolver_accepts_legacy_xiaomi_transcription_alias() -> None:
|
|||||||
assert resolved.api_key == "mimo-test"
|
assert resolved.api_key == "mimo-test"
|
||||||
|
|
||||||
|
|
||||||
|
def test_transcription_registry_lists_providers_and_aliases() -> None:
    """The registry lists AssemblyAI, exposes its default model, and resolves aliases."""
    names = transcription_provider_names()
    assert "assemblyai" in names

    assemblyai = get_transcription_provider("assemblyai")
    assert assemblyai.default_model == "universal-3-pro,universal-2"

    resolved_alias = resolve_transcription_provider("mimo")
    assert resolved_alias.name == "xiaomi_mimo"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolver_supports_assemblyai_provider_config() -> None:
    """Top-level transcription settings and AssemblyAI credentials resolve together."""
    config = Config()
    transcription = config.transcription
    transcription.provider = "assemblyai"
    transcription.model = "universal-3-pro"
    transcription.language = "en"
    credentials = config.providers.assemblyai
    credentials.api_key = "aai-test"
    credentials.api_base = "https://assembly.example/v2"

    resolved = resolve_transcription_config(config)

    observed = (
        resolved.provider,
        resolved.model,
        resolved.language,
        resolved.api_key,
        resolved.api_base,
    )
    assert observed == (
        "assemblyai",
        "universal-3-pro",
        "en",
        "aai-test",
        "https://assembly.example/v2",
    )
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_transcribe_audio_file_routes_openrouter_provider(audio_file: Path) -> None:
|
async def test_transcribe_audio_file_routes_openrouter_provider(audio_file: Path) -> None:
|
||||||
captured: dict[str, object] = {}
|
captured: dict[str, object] = {}
|
||||||
@ -200,6 +240,42 @@ async def test_transcribe_audio_file_routes_xiaomi_mimo_provider(audio_file: Pat
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_transcribe_audio_file_routes_assemblyai_provider(audio_file: Path) -> None:
    """``transcribe_audio_file`` builds the AssemblyAI provider from the effective config."""
    seen: dict[str, object] = {}

    class RecordingProvider:
        """Stand-in provider that records constructor kwargs and the file path."""

        def __init__(self, **kwargs):
            seen.update(kwargs)

        async def transcribe(self, file_path: str | Path) -> str:
            seen["file_path"] = Path(file_path)
            return "assembly ok"

    effective = EffectiveTranscriptionConfig(
        enabled=True,
        provider="assemblyai",
        model="universal-3-pro",
        language="en",
        api_key="aai-test",
        api_base="https://assembly.example/v2",
        max_duration_sec=120,
        max_upload_mb=25,
    )

    target = "nanobot.providers.transcription.AssemblyAITranscriptionProvider"
    with patch(target, RecordingProvider):
        result = await transcribe_audio_file(audio_file, effective)

    assert result == "assembly ok"
    expected = dict(
        api_key="aai-test",
        api_base="https://assembly.example/v2",
        language="en",
        model="universal-3-pro",
        file_path=audio_file,
    )
    assert seen == expected
|
||||||
|
|
||||||
|
|
||||||
def test_resolved_transcription_repr_hides_api_key() -> None:
|
def test_resolved_transcription_repr_hides_api_key() -> None:
|
||||||
config = Config()
|
config = Config()
|
||||||
config.providers.groq.api_key = "gsk-secret"
|
config.providers.groq.api_key = "gsk-secret"
|
||||||
@ -628,6 +704,126 @@ async def test_xiaomi_mimo_shares_retry_contract(audio_file: Path) -> None:
|
|||||||
assert post.await_count == 2
|
assert post.await_count == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_assemblyai_defaults_and_base_normalization() -> None:
    """Default and custom API bases both produce the upload and transcript URLs."""
    default_provider = AssemblyAITranscriptionProvider(api_key="aai-test")
    assert (
        default_provider.upload_url,
        default_provider.transcript_url,
        default_provider.model,
    ) == (
        "https://api.assemblyai.com/v2/upload",
        "https://api.assemblyai.com/v2/transcript",
        "universal-3-pro,universal-2",
    )

    custom_provider = AssemblyAITranscriptionProvider(
        api_key="aai-test",
        api_base="https://assembly.example/v2",
        model="universal-3-pro",
    )
    assert (
        custom_provider.upload_url,
        custom_provider.transcript_url,
        custom_provider.model,
    ) == (
        "https://assembly.example/v2/upload",
        "https://assembly.example/v2/transcript",
        "universal-3-pro",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_assemblyai_uploads_creates_and_polls(audio_file: Path) -> None:
    """The provider uploads audio, creates a transcript job, then polls it once."""
    provider = AssemblyAITranscriptionProvider(
        api_key="aai-test",
        api_base="https://assembly.example/v2",
        language="en",
        model="universal-3-pro,universal-2",
    )
    upload_reply = _json_response(
        200, {"upload_url": "https://cdn.example/audio"}, url=provider.upload_url
    )
    create_reply = _json_response(200, {"id": "tr_123"}, url=provider.transcript_url)
    poll_reply = _json_response(
        200,
        {"status": "completed", "text": "assembly ok"},
        method="GET",
        url=f"{provider.transcript_url}/tr_123",
    )
    post = AsyncMock(side_effect=[upload_reply, create_reply])
    get = AsyncMock(return_value=poll_reply)

    with patch("httpx.AsyncClient.post", post):
        with patch("httpx.AsyncClient.get", get):
            with patch("asyncio.sleep", AsyncMock()):
                result = await provider.transcribe(audio_file)

    assert result == "assembly ok"
    assert post.await_count == 2
    assert get.await_count == 1

    upload_call = post.await_args_list[0]
    create_call = post.await_args_list[1]

    # Upload request: raw bytes with the API key and a binary content type.
    assert upload_call.args == ("https://assembly.example/v2/upload",)
    upload_headers = upload_call.kwargs["headers"]
    assert upload_headers["Authorization"] == "aai-test"
    assert upload_headers["Content-Type"] == "application/octet-stream"
    assert upload_call.kwargs["content"] == audio_file.read_bytes()

    # Create request: JSON body referencing the uploaded audio.
    assert create_call.args == ("https://assembly.example/v2/transcript",)
    assert create_call.kwargs["json"] == {
        "audio_url": "https://cdn.example/audio",
        "speech_models": ["universal-3-pro", "universal-2"],
        "language_code": "en",
    }

    # Poll request targets the transcript id returned by the create call.
    assert get.await_args.args == ("https://assembly.example/v2/transcript/tr_123",)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_assemblyai_polls_until_completed(audio_file: Path) -> None:
    """A non-terminal status triggers one sleep and a second poll."""
    provider = AssemblyAITranscriptionProvider(api_key="aai-test")
    post_replies = [
        _json_response(200, {"upload_url": "https://cdn.example/audio"}, url=provider.upload_url),
        _json_response(200, {"id": "tr_123"}, url=provider.transcript_url),
    ]
    poll_replies = [
        _json_response(200, {"status": "processing"}, method="GET"),
        _json_response(200, {"status": "completed", "text": "done"}, method="GET"),
    ]
    post = AsyncMock(side_effect=post_replies)
    get = AsyncMock(side_effect=poll_replies)
    sleep = AsyncMock()

    with patch("httpx.AsyncClient.post", post):
        with patch("httpx.AsyncClient.get", get):
            with patch("asyncio.sleep", sleep):
                assert await provider.transcribe(audio_file) == "done"

    assert get.await_count == 2
    assert sleep.await_count == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_assemblyai_returns_empty_on_failed_transcript(audio_file: Path) -> None:
    """A transcript reporting ``error`` status yields an empty string."""
    provider = AssemblyAITranscriptionProvider(api_key="aai-test")
    post_replies = [
        _json_response(200, {"upload_url": "https://cdn.example/audio"}, url=provider.upload_url),
        _json_response(200, {"id": "tr_123"}, url=provider.transcript_url),
    ]
    failed_reply = _json_response(
        200,
        {"status": "error", "error": "bad audio"},
        method="GET",
    )
    post = AsyncMock(side_effect=post_replies)
    get = AsyncMock(return_value=failed_reply)

    with patch("httpx.AsyncClient.post", post):
        with patch("httpx.AsyncClient.get", get):
            with patch("asyncio.sleep", AsyncMock()):
                assert await provider.transcribe(audio_file) == ""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_assemblyai_missing_api_key_short_circuits(audio_file: Path) -> None:
    """With no API key from arguments or environment, no HTTP request is sent."""
    # Clear the environment so the constructor cannot pick up a key from it.
    with patch.dict("os.environ", {}, clear=True):
        provider = AssemblyAITranscriptionProvider(api_key=None)

    post = AsyncMock()
    with patch("httpx.AsyncClient.post", post):
        transcript = await provider.transcribe(audio_file)
        assert transcript == ""

    assert post.await_count == 0
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("status", [408, 429, 500, 502, 503, 504])
|
@pytest.mark.parametrize("status", [408, 429, 500, 502, 503, 504])
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_retries_on_every_advertised_transient_status(
|
async def test_retries_on_every_advertised_transient_status(
|
||||||
|
|||||||
@ -299,6 +299,50 @@ def test_settings_payload_exposes_xiaomi_mimo_transcription_provider(
|
|||||||
assert providers["xiaomi_mimo"]["configured"] is True
|
assert providers["xiaomi_mimo"]["configured"] is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_settings_payload_exposes_assemblyai_transcription_provider(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Settings list AssemblyAI for transcription and mark it non-selectable for chat."""
    config = Config()
    config.transcription.provider = "assemblyai"
    config.providers.assemblyai.api_key = "aai-test"
    config_path = tmp_path / "config.json"
    save_config(config, config_path)
    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)

    payload = settings_payload()
    transcription = payload["transcription"]

    assert transcription["provider"] == "assemblyai"
    assert transcription["provider_configured"] is True

    by_name = {row["name"]: row for row in transcription["providers"]}
    assemblyai_row = by_name["assemblyai"]
    assert assemblyai_row["label"] == "AssemblyAI"
    assert assemblyai_row["configured"] is True
    assert assemblyai_row["default_api_base"] == "https://api.assemblyai.com/v2"

    general_rows = {row["name"]: row for row in payload["providers"]}
    general_row = general_rows["assemblyai"]
    assert general_row["configured"] is True
    assert general_row["model_selectable"] is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_model_configuration_rejects_transcription_only_provider(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Creating a chat model configuration on AssemblyAI raises a settings error."""
    config = Config()
    config.providers.assemblyai.api_key = "aai-test"
    config_path = tmp_path / "config.json"
    save_config(config, config_path)
    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)

    request = {
        "label": ["Voice only"],
        "provider": ["assemblyai"],
        "model": ["universal-3-pro"],
    }
    with pytest.raises(WebUISettingsError, match="does not support chat models"):
        create_model_configuration(request)
|
||||||
|
|
||||||
|
|
||||||
def test_update_transcription_settings_writes_top_level_only(
|
def test_update_transcription_settings_writes_top_level_only(
|
||||||
tmp_path,
|
tmp_path,
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
@ -385,6 +429,30 @@ def test_update_transcription_settings_accepts_xiaomi_mimo(
|
|||||||
assert payload["transcription"]["provider_configured"] is True
|
assert payload["transcription"]["provider_configured"] is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_transcription_settings_accepts_assemblyai(
|
||||||
|
tmp_path,
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
) -> None:
|
||||||
|
config_path = tmp_path / "config.json"
|
||||||
|
config = Config()
|
||||||
|
config.providers.assemblyai.api_key = "aai-test"
|
||||||
|
save_config(config, config_path)
|
||||||
|
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||||
|
|
||||||
|
payload = update_transcription_settings(
|
||||||
|
{
|
||||||
|
"provider": ["assemblyai"],
|
||||||
|
"model": ["universal-3-pro"],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
saved = load_config(config_path)
|
||||||
|
assert saved.transcription.provider == "assemblyai"
|
||||||
|
assert saved.transcription.model == "universal-3-pro"
|
||||||
|
assert payload["transcription"]["provider"] == "assemblyai"
|
||||||
|
assert payload["transcription"]["provider_configured"] is True
|
||||||
|
|
||||||
|
|
||||||
def test_update_transcription_settings_validates_language(
|
def test_update_transcription_settings_validates_language(
|
||||||
tmp_path,
|
tmp_path,
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
|||||||
@ -779,7 +779,7 @@ export function SettingsView({
|
|||||||
const configuredModelProviderOptions = useMemo(
|
const configuredModelProviderOptions = useMemo(
|
||||||
() =>
|
() =>
|
||||||
settings?.providers
|
settings?.providers
|
||||||
.filter((provider) => provider.configured)
|
.filter((provider) => provider.configured && provider.model_selectable !== false)
|
||||||
.map((provider) => ({ name: provider.name, label: provider.label })) ?? [],
|
.map((provider) => ({ name: provider.name, label: provider.label })) ?? [],
|
||||||
[settings],
|
[settings],
|
||||||
);
|
);
|
||||||
|
|||||||
@ -113,6 +113,7 @@ const PROVIDER_BRANDS: Record<string, ProviderBrand> = {
|
|||||||
aihubmix: brand("aihubmix.com", "#111827", "AH"),
|
aihubmix: brand("aihubmix.com", "#111827", "AH"),
|
||||||
ant_ling: brand("ant-ling.com", "#7C3AED", "AL"),
|
ant_ling: brand("ant-ling.com", "#7C3AED", "AL"),
|
||||||
anthropic: brand("anthropic.com", "#D97757", "A"),
|
anthropic: brand("anthropic.com", "#D97757", "A"),
|
||||||
|
assemblyai: brand("assemblyai.com", "#111827", "AA"),
|
||||||
atomic_chat: brand("atomic.chat", "#111827", "AC"),
|
atomic_chat: brand("atomic.chat", "#111827", "AC"),
|
||||||
azure_openai: brand("azure.microsoft.com", "#0078D4", "AZ"),
|
azure_openai: brand("azure.microsoft.com", "#0078D4", "AZ"),
|
||||||
bedrock: brand("aws.amazon.com", "#FF9900", "AWS"),
|
bedrock: brand("aws.amazon.com", "#FF9900", "AWS"),
|
||||||
|
|||||||
@ -343,6 +343,7 @@ export interface SettingsPayload {
|
|||||||
api_key_hint?: string | null;
|
api_key_hint?: string | null;
|
||||||
api_base?: string | null;
|
api_base?: string | null;
|
||||||
default_api_base?: string | null;
|
default_api_base?: string | null;
|
||||||
|
model_selectable?: boolean;
|
||||||
api_type?: "auto" | "chat_completions" | "responses";
|
api_type?: "auto" | "chat_completions" | "responses";
|
||||||
oauth_account?: string | null;
|
oauth_account?: string | null;
|
||||||
oauth_expires_at?: number | null;
|
oauth_expires_at?: number | null;
|
||||||
|
|||||||
@ -47,4 +47,9 @@ describe("provider brand logos", () => {
|
|||||||
expect(providerBrand("openrouter")?.logoUrls).toContain("https://openrouter.ai/favicon.ico");
|
expect(providerBrand("openrouter")?.logoUrls).toContain("https://openrouter.ai/favicon.ico");
|
||||||
expect(providerBrand("openrouter")?.initials).toBe("OR");
|
expect(providerBrand("openrouter")?.initials).toBe("OR");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("keeps AssemblyAI voice settings on the first-party brand domain", () => {
|
||||||
|
expect(providerBrand("assemblyai")?.logoUrls).toContain("https://assemblyai.com/favicon.ico");
|
||||||
|
expect(providerBrand("assemblyai")?.initials).toBe("AA");
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user