/**
 * Decides whether a permission check/request originates from the app itself.
 *
 * The request is accepted when at least one candidate URL — the
 * `requestingUrl` / `securityOrigin` reported in `details`, or the URL
 * currently loaded in `webContents` — passes `isTrustedAppUrl`.
 *
 * NOTE(review): a single match is enough, so a frame whose own
 * `requestingUrl` is untrusted is still accepted while the hosting
 * `webContents` shows a trusted URL — confirm this is intended.
 *
 * @param webContents - Requesting web contents, or `null` when Electron does not supply one.
 * @param details - Opaque Electron permission details payload.
 * @returns `true` when any candidate URL is a trusted app URL.
 */
function isTrustedPermissionRequest(
  webContents: WebContents | null,
  details: unknown,
): boolean {
  const candidates: unknown[] = [
    permissionDetail(details, "requestingUrl"),
    permissionDetail(details, "securityOrigin"),
    webContents?.getURL(),
  ];
  return candidates.some((url) => typeof url === "string" && isTrustedAppUrl(url));
}

/**
 * Reads one property from a permission `details` payload without trusting its shape.
 *
 * @param details - Value of unknown shape handed over by Electron.
 * @param key - Property name to read.
 * @returns The property value, or `undefined` when `details` is not a non-null object.
 */
function permissionDetail(details: unknown, key: string): unknown {
  if (typeof details !== "object" || details === null) return undefined;
  // Safe after the guard above: every non-null object can be indexed by string.
  return (details as Record<string, unknown>)[key];
}

/**
 * Tells whether a media permission concerns the microphone only.
 *
 * When `details.mediaTypes` is an array it must contain `"audio"` and must
 * not contain `"video"`; otherwise the single `details.mediaType` value must
 * be exactly `"audio"`.
 */
function isAudioOnlyMediaRequest(details: unknown): boolean {
  const mediaTypes = permissionDetail(details, "mediaTypes");
  if (Array.isArray(mediaTypes)) {
    return mediaTypes.includes("audio") && !mediaTypes.includes("video");
  }
  return permissionDetail(details, "mediaType") === "audio";
}

/**
 * Resolves OS-level microphone access.
 *
 * On platforms other than macOS this resolves to `true` without asking.
 * On macOS an existing "granted" status resolves to `true`, "denied" and
 * "restricted" resolve to `false`, and any other status triggers the native
 * prompt via `systemPreferences.askForMediaAccess`.
 *
 * @returns Whether microphone capture may proceed.
 */
async function requestNativeMicrophoneAccess(): Promise<boolean> {
  if (process.platform !== "darwin") return true;
  const status = systemPreferences.getMediaAccessStatus("microphone");
  if (status === "granted") return true;
  if (status === "denied" || status === "restricted") return false;
  return await systemPreferences.askForMediaAccess("microphone");
}

/**
 * Shared predicate for both permission handlers: only audio-only `media`
 * permissions coming from a trusted app URL are eligible.
 */
function isTrustedMicrophonePermission(
  webContents: WebContents | null,
  permission: string,
  details: unknown,
): boolean {
  return (
    permission === "media"
    && isTrustedPermissionRequest(webContents, details)
    && isAudioOnlyMediaRequest(details)
  );
}

/**
 * Installs the permission check and request handlers on the default session.
 *
 * Everything except a trusted, audio-only `media` permission is refused.
 * Eligible requests are answered with the result of the native microphone
 * check; if that check rejects, the request is denied.
 */
function registerPermissionHandlers(): void {
  session.defaultSession.setPermissionCheckHandler((webContents, permission, _origin, details) => (
    isTrustedMicrophonePermission(webContents, permission, details)
  ));
  session.defaultSession.setPermissionRequestHandler((webContents, permission, callback, details) => {
    if (!isTrustedMicrophonePermission(webContents, permission, details)) {
      callback(false);
      return;
    }
    // Fire-and-forget: both outcomes are routed into `callback`, so nothing is left unhandled.
    void requestNativeMicrophoneAccess().then(callback, () => callback(false));
  });
}
session_key?)` | **Call this when you receive a message.** Checks `is_allowed()`, then publishes to the bus. Automatically sets `_wants_stream` if `supports_streaming` is true. | | `is_allowed(sender_id)` | Checks against `config.allow_from`; `"*"` allows all, `[]` denies all. | | `default_config()` (classmethod) | Returns default config dict for `nanobot onboard`. Override to declare your fields. | -| `transcribe_audio(file_path)` | Transcribes audio via Groq Whisper (if configured). | +| `transcribe_audio(file_path)` | Transcribes audio via the shared top-level `transcription` config (if configured). | | `supports_streaming` (property) | `True` when config has `"streaming": true` **and** subclass overrides `send_delta()`. | | `is_running` | Returns `self._running`. | | `login(force=False)` | Perform interactive login (e.g. QR code scan). Returns `True` if already authenticated or login succeeds. Override in subclasses that support interactive login. | diff --git a/docs/configuration.md b/docs/configuration.md index 3a583a1a1..3ed500394 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -119,7 +119,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent ## Providers > [!TIP] -> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead, and optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) for more accurate transcription. The API key is picked from the matching provider config. +> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` to use OpenAI Whisper. API keys still live in the matching `providers.<provider>` config. 
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link) > - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config. > - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`. @@ -1100,6 +1100,61 @@ Set `agents.defaults.modelPreset` to start with a named preset: When `modelPreset` is `null` or omitted, startup uses the implicit `default` preset from `agents.defaults.*`. Runtime changes made with `/model <preset>` are not written back to `config.json`; they affect future turns until the process restarts or another model/config change replaces them. +## Transcription Settings + +Audio transcription is a shared capability used by chat-channel voice messages and by WebUI/desktop microphone input. Chat-channel voice messages are transcribed automatically before they enter the agent. WebUI and desktop microphone input is transcribed into the composer first, so you can edit the text before sending. 
+ +Configure transcription under the top-level `transcription` section: + +```json +{ + "transcription": { + "enabled": true, + "provider": "groq", + "model": null, + "language": null, + "maxDurationSec": 120, + "maxUploadMb": 25 + } +} +``` + +| Setting | Default | Description | +|---------|---------|-------------| +| `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. | +| `provider` | `"groq"` | Transcription backend: `"groq"` or `"openai"`. | +| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq and `whisper-1` for OpenAI. | +| `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. | +| `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration. | +| `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size. | + +Provider and language resolution is intentionally ordered for backwards compatibility: + +1. `transcription.provider` / `transcription.language` +2. Legacy `channels.transcriptionProvider` / `channels.transcriptionLanguage` +3. Built-in defaults (`provider: "groq"`, no language hint) + +The legacy `channels.*` transcription fields existed before transcription became a shared capability across chat channels and WebUI/desktop microphone input. They are still read so older `config.json` files keep working, but they are no longer the preferred configuration surface. If both old and new fields are present, the top-level `transcription` values are the source of truth. + +Transcription credentials are intentionally not stored in `transcription`. 
Put the API key and optional endpoint in the matching provider config: + +```json +{ + "providers": { + "groq": { + "apiKey": "gsk-...", + "apiBase": "https://api.groq.com/openai/v1" + } + }, + "transcription": { + "provider": "groq", + "language": "zh" + } +} +``` + +Selecting a transcription provider does not configure credentials by itself. For example, the effective provider may default to Groq for compatibility, but transcription is only usable when `providers.groq.apiKey` or the matching environment-backed config is available. The Settings UI writes only the top-level `transcription` fields. + ## Channel Settings Global settings that apply to all channels. Configure under the `channels` section in `~/.nanobot/config.json`: @@ -1111,8 +1166,6 @@ Global settings that apply to all channels. Configure under the `channels` secti "sendToolHints": false, "extractDocumentText": true, "sendMaxRetries": 3, - "transcriptionProvider": "groq", - "transcriptionLanguage": null, "telegram": { ... } } } @@ -1125,8 +1178,8 @@ Global settings that apply to all channels. Configure under the `channels` secti | `showReasoning` | `true` | Allow channels to surface model reasoning/thinking content (DeepSeek-R1 `reasoning_content`, Anthropic `thinking_blocks`, inline `` tags). Reasoning flows as a dedicated stream with `_reasoning_delta` / `_reasoning_end` markers — channels override `send_reasoning_delta` / `send_reasoning_end` to render in-place updates. Even with `true`, channels without those overrides stay no-op silently. Currently surfaced on CLI and WebSocket/WebUI (italic shimmer header, auto-collapses after the stream ends); Telegram / Slack / Discord / Feishu / WeChat / Matrix keep the base no-op until their bubble UI is adapted. Independent of `sendProgress`. | | `extractDocumentText` | `true` | Extract supported document/text attachments into the model prompt. Set to `false` to keep document content out of the prompt and include attachment path references instead. 
| | `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) | -| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key and optional `apiBase` are auto-resolved from the matching provider config. Chat-style bases such as `https://api.groq.com/openai/v1` are normalized to the audio transcription endpoint. | -| `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. | + +`channels.transcriptionProvider` and `channels.transcriptionLanguage` are deprecated compatibility fields. They remain as a read-only fallback for older configs, but new configuration should use top-level `transcription.provider` and `transcription.language`. `sendProgress` and `sendToolHints` can also be overridden per channel. The global values stay as defaults for channels that do not set their own value: diff --git a/nanobot/agent/tools/exec_session.py b/nanobot/agent/tools/exec_session.py index a1d84827c..b0d79978b 100644 --- a/nanobot/agent/tools/exec_session.py +++ b/nanobot/agent/tools/exec_session.py @@ -24,6 +24,7 @@ DEFAULT_WAIT_FOR_MS = 10_000 MAX_WAIT_FOR_MS = 120_000 DEFAULT_MAX_OUTPUT_CHARS = 10_000 MAX_OUTPUT_CHARS = 50_000 +OUTPUT_DRAIN_GRACE_S = 0.1 @dataclass(slots=True) @@ -139,6 +140,8 @@ class _ExecSession: asyncio.gather(self._stdout_task, self._stderr_task), timeout=2.0, ) + elif yield_time_ms > 0: + await self._wait_for_buffered_output() async with self._lock: output = "".join(self._chunks) @@ -163,6 +166,14 @@ class _ExecSession: with suppress(asyncio.TimeoutError): await asyncio.wait_for(self.process.wait(), timeout=5.0) + async def _wait_for_buffered_output(self) -> None: + deadline = time.monotonic() + OUTPUT_DRAIN_GRACE_S + while time.monotonic() < deadline: + async with self._lock: + if self._chunks: + return + await asyncio.sleep(0.01) + class 
ExecSessionManager: def __init__(self, *, max_sessions: int = 8, idle_timeout: int = 1800) -> None: diff --git a/nanobot/audio/__init__.py b/nanobot/audio/__init__.py new file mode 100644 index 000000000..2e21f694d --- /dev/null +++ b/nanobot/audio/__init__.py @@ -0,0 +1,2 @@ +"""Shared audio service helpers.""" + diff --git a/nanobot/audio/transcription.py b/nanobot/audio/transcription.py new file mode 100644 index 000000000..d27094f3c --- /dev/null +++ b/nanobot/audio/transcription.py @@ -0,0 +1,183 @@ +"""Application-level audio transcription service. + +This module owns nanobot's transcription behavior: config resolution, +legacy channel fallback, upload validation, temporary-file handling, and +dispatch to provider adapters. It deliberately does not know provider-specific +HTTP details; those live in ``nanobot.providers.transcription``. +""" + +from __future__ import annotations + +from contextlib import suppress +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Literal + +from loguru import logger + +from nanobot.config.paths import get_media_dir +from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url + +TranscriptionProviderName = Literal["groq", "openai"] + +_DEFAULT_PROVIDER: TranscriptionProviderName = "groq" +_DEFAULT_MODELS: dict[TranscriptionProviderName, str] = { + "groq": "whisper-large-v3", + "openai": "whisper-1", +} +_MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024 +_AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({ + "audio/aac", + "audio/flac", + "audio/m4a", + "audio/mp4", + "audio/mpeg", + "audio/ogg", + "audio/wav", + "audio/webm", + "audio/x-m4a", + "audio/x-wav", +}) + + +@dataclass(frozen=True) +class EffectiveTranscriptionConfig: + enabled: bool + provider: TranscriptionProviderName + model: str + language: str | None + api_key: str = field(repr=False) + api_base: str + max_duration_sec: int + max_upload_mb: int + + @property + def configured(self) -> bool: + return 
bool(self.api_key) + + +class TranscriptionIngressError(Exception): + """Stable transcription upload error surfaced to WebUI clients.""" + + def __init__(self, detail: str, **extra: Any): + super().__init__(detail) + self.detail = detail + self.extra = extra + + +def _as_provider(value: Any) -> TranscriptionProviderName | None: + if isinstance(value, str): + name = value.strip().lower() + if name in _DEFAULT_MODELS: + return name # type: ignore[return-value] + return None + + +def _provider_config(config: Any, provider: str) -> Any: + return getattr(getattr(config, "providers", None), provider, None) + + +def _extract_data_url_mime(url: str) -> str | None: + header, _, _ = url.partition(",") + if not header.startswith("data:") or ";base64" not in header: + return None + return header[5:].split(";", 1)[0].strip().lower() or None + + +def resolve_transcription_config(config: Any) -> EffectiveTranscriptionConfig: + """Resolve top-level transcription settings with legacy channel fallback.""" + top = getattr(config, "transcription", None) + channels = getattr(config, "channels", None) + provider = ( + _as_provider(getattr(top, "provider", None)) + or _as_provider(getattr(channels, "transcription_provider", None)) + or _DEFAULT_PROVIDER + ) + provider_cfg = _provider_config(config, provider) + return EffectiveTranscriptionConfig( + enabled=bool(getattr(top, "enabled", True)), + provider=provider, + model=(getattr(top, "model", None) or _DEFAULT_MODELS[provider]).strip(), + language=getattr(top, "language", None) or getattr(channels, "transcription_language", None), + api_key=getattr(provider_cfg, "api_key", None) or "", + api_base=getattr(provider_cfg, "api_base", None) or "", + max_duration_sec=int(getattr(top, "max_duration_sec", 120)), + max_upload_mb=int(getattr(top, "max_upload_mb", 25)), + ) + + +async def transcribe_audio_data_url( + data_url: Any, + config: EffectiveTranscriptionConfig, + *, + duration_ms: Any = None, +) -> str: + """Validate, persist, 
transcribe, and remove a WebUI audio data URL.""" + if not isinstance(data_url, str) or not data_url: + raise TranscriptionIngressError("missing_audio") + if not config.enabled: + raise TranscriptionIngressError("disabled") + if not config.configured: + raise TranscriptionIngressError("not_configured", provider=config.provider) + if ( + isinstance(duration_ms, (int, float)) + and duration_ms > (config.max_duration_sec * 1000 + 1000) + ): + raise TranscriptionIngressError("duration") + if _extract_data_url_mime(data_url) not in _AUDIO_MIME_ALLOWED: + raise TranscriptionIngressError("mime") + + audio_path: str | None = None + max_bytes = max( + 1, + config.max_upload_mb * 1024 * 1024 if config.max_upload_mb else _MAX_AUDIO_BYTES_FALLBACK, + ) + try: + audio_path = save_base64_data_url( + data_url, + get_media_dir("webui-transcription"), + max_bytes=max_bytes, + ) + except FileSizeExceeded as exc: + raise TranscriptionIngressError("size") from exc + except Exception as exc: + logger.warning("transcription audio decode failed: {}", exc) + if not audio_path: + raise TranscriptionIngressError("decode") + + try: + text = await transcribe_audio_file(audio_path, config) + finally: + with suppress(OSError): + Path(audio_path).unlink(missing_ok=True) + if not text: + raise TranscriptionIngressError("empty") + return text + + +async def transcribe_audio_file( + file_path: str | Path, + config: EffectiveTranscriptionConfig, +) -> str: + """Transcribe *file_path* using the already-resolved transcription config.""" + if not config.enabled or not config.configured: + return "" + if config.provider == "openai": + from nanobot.providers.transcription import OpenAITranscriptionProvider + + provider = OpenAITranscriptionProvider( + api_key=config.api_key, + api_base=config.api_base or None, + language=config.language, + model=config.model, + ) + else: + from nanobot.providers.transcription import GroqTranscriptionProvider + + provider = GroqTranscriptionProvider( + 
api_key=config.api_key, + api_base=config.api_base or None, + language=config.language, + model=config.model, + ) + return await provider.transcribe(file_path) diff --git a/nanobot/channels/base.py b/nanobot/channels/base.py index f9d7bdd19..37fff8a49 100644 --- a/nanobot/channels/base.py +++ b/nanobot/channels/base.py @@ -28,10 +28,6 @@ class BaseChannel(ABC): name: str = "base" display_name: str = "Base" - transcription_provider: str = "groq" - transcription_api_key: str = "" - transcription_api_base: str = "" - transcription_language: str | None = None send_progress: bool = True send_tool_hints: bool = False show_reasoning: bool = True @@ -51,24 +47,14 @@ class BaseChannel(ABC): async def transcribe_audio(self, file_path: str | Path) -> str: """Transcribe an audio file via Whisper (OpenAI or Groq). Returns empty string on failure.""" - if not self.transcription_api_key: - return "" try: - if self.transcription_provider == "openai": - from nanobot.providers.transcription import OpenAITranscriptionProvider - provider = OpenAITranscriptionProvider( - api_key=self.transcription_api_key, - api_base=self.transcription_api_base or None, - language=self.transcription_language or None, - ) - else: - from nanobot.providers.transcription import GroqTranscriptionProvider - provider = GroqTranscriptionProvider( - api_key=self.transcription_api_key, - api_base=self.transcription_api_base or None, - language=self.transcription_language or None, - ) - return await provider.transcribe(file_path) + from nanobot.audio.transcription import ( + resolve_transcription_config, + transcribe_audio_file, + ) + from nanobot.config.loader import load_config + + return await transcribe_audio_file(file_path, resolve_transcription_config(load_config())) except Exception: self.logger.exception("Audio transcription failed") return "" diff --git a/nanobot/channels/manager.py b/nanobot/channels/manager.py index ffa5cca67..b59925232 100644 --- a/nanobot/channels/manager.py +++ 
b/nanobot/channels/manager.py @@ -80,11 +80,6 @@ class ChannelManager: """Initialize channels discovered via pkgutil scan + entry_points plugins.""" from nanobot.channels.registry import discover_channel_names, discover_enabled - transcription_provider = self.config.channels.transcription_provider - transcription_key = self._resolve_transcription_key(transcription_provider) - transcription_base = self._resolve_transcription_base(transcription_provider) - transcription_language = self.config.channels.transcription_language - # Collect enabled module names first, then only import those. # Channel configs live in ChannelsConfig's extra fields (via # extra="allow"), so we enumerate candidates from pkgutil scan @@ -135,10 +130,6 @@ class ChannelManager: ) kwargs["gateway"] = gateway channel = cls(section, self.bus, **kwargs) - channel.transcription_provider = transcription_provider - channel.transcription_api_key = transcription_key - channel.transcription_api_base = transcription_base - channel.transcription_language = transcription_language channel.send_progress = self._resolve_bool_override( section, "send_progress", self.config.channels.send_progress, ) @@ -155,24 +146,6 @@ class ChannelManager: self._validate_allow_from() - def _resolve_transcription_key(self, provider: str) -> str: - """Pick the API key for the configured transcription provider.""" - try: - if provider == "openai": - return self.config.providers.openai.api_key - return self.config.providers.groq.api_key - except AttributeError: - return "" - - def _resolve_transcription_base(self, provider: str) -> str: - """Pick the API base URL for the configured transcription provider.""" - try: - if provider == "openai": - return self.config.providers.openai.api_base or "" - return self.config.providers.groq.api_base or "" - except AttributeError: - return "" - def _validate_allow_from(self) -> None: for name, ch in self.channels.items(): cfg = ch.config diff --git a/nanobot/channels/websocket.py 
b/nanobot/channels/websocket.py index 8675b6252..b3f58d982 100644 --- a/nanobot/channels/websocket.py +++ b/nanobot/channels/websocket.py @@ -45,6 +45,7 @@ from nanobot.webui.http_utils import ( query_first as _query_first, ) from nanobot.webui.mcp_presets_api import normalize_mcp_preset_mentions +from nanobot.webui.transcription_ws import webui_transcription_event from nanobot.webui.websocket_logging import websockets_server_logger @@ -235,7 +236,7 @@ _VIDEO_MIME_ALLOWED: frozenset[str] = frozenset({ _UPLOAD_MIME_ALLOWED: frozenset[str] = _IMAGE_MIME_ALLOWED | _VIDEO_MIME_ALLOWED -_DATA_URL_MIME_RE = re.compile(r"^data:([^;]+);base64,", re.DOTALL) +_DATA_URL_MIME_RE = re.compile(r"^data:([^;,]+)(?:;[^,]*)*;base64,", re.DOTALL) def _extract_data_url_mime(url: str) -> str | None: @@ -419,7 +420,6 @@ class WebSocketChannel(BaseChannel): return None # -- Server lifecycle and connection ingress --------------------------- - # -- Server lifecycle and connection ingress --------------------------- async def start(self) -> None: from nanobot.utils.logging_bridge import redirect_lib_logging @@ -703,6 +703,10 @@ class WebSocketChannel(BaseChannel): workspace_scope=scope.payload(), ) return + if t == "transcribe_audio": + event, payload = await webui_transcription_event(envelope) + await self._send_event(connection, event, **payload) + return if t == "message": cid = envelope.get("chat_id") content = envelope.get("content") diff --git a/nanobot/config/schema.py b/nanobot/config/schema.py index b9ebbd7ed..1ca13c4f2 100644 --- a/nanobot/config/schema.py +++ b/nanobot/config/schema.py @@ -39,8 +39,19 @@ class ChannelsConfig(Base): show_reasoning: bool = True # surface model reasoning when channel implements it extract_document_text: bool = True # extract text from document attachments before sending to the model send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included) - transcription_provider: str = "groq" # Voice transcription 
class TranscriptionConfig(Base):
    """Cross-channel audio transcription configuration.

    Top-level ``transcription`` section of the config. Credentials are not
    stored here; they are read from the matching provider config.
    """

    # Master switch for audio transcription.
    enabled: bool = True
    # Backend name; None leaves the choice to the legacy channels field / built-in default.
    provider: Literal["groq", "openai"] | None = None
    # Optional model override; None means the provider's default model is used.
    model: str | None = None
    # Optional language hint: two or three lowercase letters (e.g. "en", "zh").
    language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")
    # Maximum recording duration in seconds (1-600).
    max_duration_sec: int = Field(default=120, ge=1, le=600)
    # Maximum audio upload size in megabytes (1-100).
    max_upload_mb: int = Field(default=25, ge=1, le=100)
+ +This module only knows how to call external transcription APIs such as Groq +and OpenAI Whisper. Product-level config fallback, WebUI upload validation, +and channel integration live in ``nanobot.audio.transcription``. +""" import asyncio +import mimetypes import os from pathlib import Path @@ -8,6 +14,15 @@ import httpx from loguru import logger _TRANSCRIPTIONS_PATH = "audio/transcriptions" +_AUDIO_MIME_OVERRIDES = { + ".m4a": "audio/mp4", + ".mpga": "audio/mpeg", + ".ogg": "audio/ogg", + ".opus": "audio/ogg", + ".wav": "audio/wav", + ".weba": "audio/webm", + ".webm": "audio/webm", +} def _resolve_transcription_url(api_base: str | None, default_url: str) -> str: @@ -26,6 +41,14 @@ def _resolve_transcription_url(api_base: str | None, default_url: str) -> str: return f"{base}/{_TRANSCRIPTIONS_PATH}" +def _audio_mime_type(path: Path) -> str: + return ( + _AUDIO_MIME_OVERRIDES.get(path.suffix.lower()) + or mimetypes.guess_type(path.name)[0] + or "application/octet-stream" + ) + + # Up to 3 retries (4 attempts total) with exponential backoff on transient # failures. Whisper endpoints occasionally return 502/503 under load, and # mobile-network transcription callers hit sporadic connect/read errors. 
@@ -71,7 +94,7 @@ async def _post_transcription_with_retry( async with httpx.AsyncClient() as client: for attempt in range(_MAX_RETRIES + 1): files = { - "file": (path.name, data), + "file": (path.name, data, _audio_mime_type(path)), "model": (None, model), } if language: @@ -113,6 +136,16 @@ async def _post_transcription_with_retry( try: response.raise_for_status() + except httpx.HTTPStatusError: + body = response.text.strip().replace("\n", " ")[:500] + logger.error( + "{} transcription HTTP {}{}{}", + provider_label, + response.status_code, + f" {response.reason_phrase}" if response.reason_phrase else "", + f": {body}" if body else "", + ) + return "" except Exception as e: logger.exception("{} transcription error: {}", provider_label, e) return "" @@ -144,6 +177,7 @@ class OpenAITranscriptionProvider: api_key: str | None = None, api_base: str | None = None, language: str | None = None, + model: str | None = None, ): self.api_key = api_key or os.environ.get("OPENAI_API_KEY") self.api_url = _resolve_transcription_url( @@ -151,6 +185,7 @@ class OpenAITranscriptionProvider: "https://api.openai.com/v1/audio/transcriptions", ) self.language = language or None + self.model = model or "whisper-1" logger.debug("OpenAI transcription endpoint: {}", self.api_url) async def transcribe(self, file_path: str | Path) -> str: @@ -165,7 +200,7 @@ class OpenAITranscriptionProvider: self.api_url, api_key=self.api_key, path=path, - model="whisper-1", + model=self.model, provider_label="OpenAI", language=self.language, ) @@ -183,6 +218,7 @@ class GroqTranscriptionProvider: api_key: str | None = None, api_base: str | None = None, language: str | None = None, + model: str | None = None, ): self.api_key = api_key or os.environ.get("GROQ_API_KEY") self.api_url = _resolve_transcription_url( @@ -190,6 +226,7 @@ class GroqTranscriptionProvider: "https://api.groq.com/openai/v1/audio/transcriptions", ) self.language = language or None + self.model = model or "whisper-large-v3" 
logger.debug("Groq transcription endpoint: {}", self.api_url) async def transcribe(self, file_path: str | Path) -> str: @@ -215,7 +252,7 @@ class GroqTranscriptionProvider: self.api_url, api_key=self.api_key, path=path, - model="whisper-large-v3", + model=self.model, provider_label="Groq", language=self.language, ) diff --git a/nanobot/utils/media_decode.py b/nanobot/utils/media_decode.py index 484613d97..0c1682e72 100644 --- a/nanobot/utils/media_decode.py +++ b/nanobot/utils/media_decode.py @@ -18,13 +18,30 @@ from nanobot.utils.helpers import safe_filename DEFAULT_MAX_BYTES = 10 * 1024 * 1024 MAX_FILE_SIZE = DEFAULT_MAX_BYTES -_DATA_URL_RE = re.compile(r"^data:([^;]+);base64,(.+)$", re.DOTALL) +_DATA_URL_RE = re.compile(r"^data:([^;,]+)(?:;[^,]*)*;base64,(.+)$", re.DOTALL) +_MIME_EXTENSION_OVERRIDES = { + # Python's ``mimetypes`` maps browser-recorded audio/webm to ``.weba`` and + # audio/ogg to ``.oga`` on macOS. Some transcription APIs validate by the + # file extension and accept the canonical container extensions instead. 
+ "application/ogg": ".ogg", + "audio/ogg": ".ogg", + "audio/mpga": ".mpga", + "audio/wav": ".wav", + "audio/webm": ".webm", + "audio/x-m4a": ".m4a", + "audio/x-wav": ".wav", + "audio/vnd.wave": ".wav", + "video/webm": ".webm", +} -class FileSizeExceeded(Exception): +class FileSizeExceededError(Exception): """Raised when a decoded payload exceeds the caller's size limit.""" +FileSizeExceeded = FileSizeExceededError + + def save_base64_data_url( data_url: str, media_dir: Path, @@ -40,7 +57,7 @@ def save_base64_data_url( m = _DATA_URL_RE.match(data_url) if not m: return None - mime_type, b64_payload = m.group(1), m.group(2) + mime_type, b64_payload = m.group(1).strip().lower(), m.group(2) try: raw = base64.b64decode(b64_payload) except Exception: @@ -48,7 +65,7 @@ def save_base64_data_url( limit = DEFAULT_MAX_BYTES if max_bytes is None else max_bytes if len(raw) > limit: raise FileSizeExceeded(f"File exceeds {limit // (1024 * 1024)}MB limit") - ext = mimetypes.guess_extension(mime_type) or ".bin" + ext = _MIME_EXTENSION_OVERRIDES.get(mime_type) or mimetypes.guess_extension(mime_type) or ".bin" filename = f"{uuid.uuid4().hex[:12]}{ext}" dest = media_dir / safe_filename(filename) dest.write_bytes(raw) diff --git a/nanobot/webui/settings_api.py b/nanobot/webui/settings_api.py index 3f3df3957..3b90fe081 100644 --- a/nanobot/webui/settings_api.py +++ b/nanobot/webui/settings_api.py @@ -15,6 +15,7 @@ from zoneinfo import ZoneInfo import httpx +from nanobot.audio.transcription import resolve_transcription_config from nanobot.config.loader import get_config_path, load_config, save_config from nanobot.config.schema import ModelPresetConfig from nanobot.providers.image_generation import ( @@ -90,6 +91,7 @@ _IMAGE_GENERATION_ASPECT_RATIOS = { "2:3", "21:9", } +_TRANSCRIPTION_PROVIDERS = ("groq", "openai") _CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144} _MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+") _ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}") @@ 
-576,6 +578,22 @@ def _image_generation_provider_rows(config: Any) -> list[dict[str, Any]]: return rows +def _transcription_provider_rows(config: Any) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for name in _TRANSCRIPTION_PROVIDERS: + spec = find_by_name(name) + provider_config = getattr(config.providers, name, None) + rows.append({ + "name": name, + "label": spec.label if spec is not None else name, + "configured": bool(getattr(provider_config, "api_key", None)), + "api_key_hint": _mask_secret_hint(getattr(provider_config, "api_key", None)), + "api_base": getattr(provider_config, "api_base", None), + "default_api_base": spec.default_api_base if spec and spec.default_api_base else None, + }) + return rows + + def settings_payload( *, requires_restart: bool = False, @@ -633,6 +651,7 @@ def settings_payload( search_config = config.tools.web.search image_config = config.tools.image_generation + transcription = resolve_transcription_config(config) search_provider = ( search_config.provider if search_config.provider in _WEB_SEARCH_PROVIDER_BY_NAME @@ -733,6 +752,16 @@ def settings_payload( "save_dir": image_config.save_dir, "providers": image_providers, }, + "transcription": { + "enabled": transcription.enabled, + "provider": transcription.provider, + "provider_configured": transcription.configured, + "model": transcription.model, + "language": transcription.language, + "max_duration_sec": transcription.max_duration_sec, + "max_upload_mb": transcription.max_upload_mb, + "providers": _transcription_provider_rows(config), + }, "runtime": { "config_path": str(get_config_path().expanduser()), "workspace_path": str(config.workspace_path), @@ -1311,3 +1340,71 @@ def update_image_generation_settings(query: QueryParams) -> dict[str, Any]: if changed: save_config(config) return settings_payload(requires_restart=changed) + + +def update_transcription_settings(query: QueryParams) -> dict[str, Any]: + config = load_config() + transcription = config.transcription + 
changed = False + + enabled = _query_first(query, "enabled") + if enabled is not None: + parsed_enabled = _parse_bool(enabled, "enabled") + if transcription.enabled != parsed_enabled: + transcription.enabled = parsed_enabled + changed = True + + provider = _query_first(query, "provider") + if provider is not None: + provider = provider.strip().lower() + if provider not in _TRANSCRIPTION_PROVIDERS: + raise WebUISettingsError("unknown transcription provider") + if transcription.provider != provider: + transcription.provider = provider # type: ignore[assignment] + changed = True + + model = _query_first(query, "model") + if model is not None: + model = model.strip() or None + if model is not None and len(model) > 200: + raise WebUISettingsError("transcription model is too long") + if transcription.model != model: + transcription.model = model + changed = True + + language = _query_first(query, "language") + if language is not None: + language = language.strip().lower() or None + if language is not None and not re.fullmatch(r"[a-z]{2,3}", language): + raise WebUISettingsError("transcription language must be 2-3 lowercase letters") + if transcription.language != language: + transcription.language = language + changed = True + + max_duration_sec = _query_first_alias(query, "max_duration_sec", "maxDurationSec") + if max_duration_sec is not None: + try: + parsed_duration = int(max_duration_sec) + except ValueError: + raise WebUISettingsError("max_duration_sec must be an integer") from None + if parsed_duration < 1 or parsed_duration > 600: + raise WebUISettingsError("max_duration_sec must be between 1 and 600") + if transcription.max_duration_sec != parsed_duration: + transcription.max_duration_sec = parsed_duration + changed = True + + max_upload_mb = _query_first_alias(query, "max_upload_mb", "maxUploadMb") + if max_upload_mb is not None: + try: + parsed_upload = int(max_upload_mb) + except ValueError: + raise WebUISettingsError("max_upload_mb must be an integer") from 
None + if parsed_upload < 1 or parsed_upload > 100: + raise WebUISettingsError("max_upload_mb must be between 1 and 100") + if transcription.max_upload_mb != parsed_upload: + transcription.max_upload_mb = parsed_upload + changed = True + + if changed: + save_config(config) + return settings_payload() diff --git a/nanobot/webui/settings_routes.py b/nanobot/webui/settings_routes.py index ff5b7d7df..b8dbb4b73 100644 --- a/nanobot/webui/settings_routes.py +++ b/nanobot/webui/settings_routes.py @@ -33,6 +33,7 @@ from nanobot.webui.settings_api import ( update_model_configuration, update_network_safety_settings, update_provider_settings, + update_transcription_settings, update_web_search_settings, ) @@ -100,6 +101,8 @@ class WebUISettingsRouter: return self._handle_settings_web_search_update(request) if path == "/api/settings/image-generation/update": return self._handle_settings_image_generation_update(request) + if path == "/api/settings/transcription/update": + return self._handle_settings_transcription_update(request) if path == "/api/settings/network-safety/update": return self._handle_settings_network_safety_update(request) if path == "/api/settings/cli-apps": @@ -275,6 +278,15 @@ class WebUISettingsRouter: return self._error_response(e.status, e.message) return self._json_response(self._with_restart_state(payload, section="image")) + def _handle_settings_transcription_update(self, request: WsRequest) -> Response: + if not self._authorized(request): + return self._unauthorized() + try: + payload = update_transcription_settings(self._query(request)) + except WebUISettingsError as e: + return self._error_response(e.status, e.message) + return self._json_response(self._with_restart_state(payload)) + def _handle_settings_network_safety_update(self, request: WsRequest) -> Response: if not self._authorized(request): return self._unauthorized() diff --git a/nanobot/webui/transcription_ws.py b/nanobot/webui/transcription_ws.py new file mode 100644 index 
000000000..8404206e1 --- /dev/null +++ b/nanobot/webui/transcription_ws.py @@ -0,0 +1,46 @@ +"""WebUI transcription envelope handling. + +The WebSocket channel owns transport and subscription fan-out. This module owns +the WebUI-specific audio transcription action carried over that socket. +""" + +from __future__ import annotations + +from typing import Any + +from nanobot.audio.transcription import ( + TranscriptionIngressError, + resolve_transcription_config, + transcribe_audio_data_url, +) +from nanobot.config.loader import load_config + +_MAX_REQUEST_ID_LENGTH = 80 + + +async def webui_transcription_event(envelope: dict[str, Any]) -> tuple[str, dict[str, Any]]: + """Return the WS event name and payload for one WebUI transcription request.""" + request_id = envelope.get("request_id") + valid_request_id = ( + isinstance(request_id, str) + and 0 < len(request_id) <= _MAX_REQUEST_ID_LENGTH + ) + + def error(detail: str, **extra: Any) -> tuple[str, dict[str, Any]]: + payload: dict[str, Any] = {"detail": detail, **extra} + if valid_request_id: + payload["request_id"] = request_id + return "transcription_error", payload + + if not valid_request_id: + return error("invalid_request") + + try: + text = await transcribe_audio_data_url( + envelope.get("data_url"), + resolve_transcription_config(load_config()), + duration_ms=envelope.get("duration_ms"), + ) + except TranscriptionIngressError as exc: + return error(exc.detail, **exc.extra) + return "transcription_result", {"request_id": request_id, "text": text} diff --git a/tests/channels/test_channel_plugins.py b/tests/channels/test_channel_plugins.py index d29dfe4ff..f881cebba 100644 --- a/tests/channels/test_channel_plugins.py +++ b/tests/channels/test_channel_plugins.py @@ -12,7 +12,8 @@ from nanobot.bus.events import OutboundMessage from nanobot.bus.queue import MessageBus from nanobot.channels.base import BaseChannel from nanobot.channels.manager import ChannelManager -from nanobot.config.schema import ChannelsConfig 
+from nanobot.config.loader import save_config +from nanobot.config.schema import ChannelsConfig, Config from nanobot.providers.transcription import GroqTranscriptionProvider as _GroqProvider from nanobot.providers.transcription import OpenAITranscriptionProvider as _OpenAIProvider from nanobot.utils.restart import RestartNotice @@ -238,102 +239,103 @@ async def test_manager_loads_plugin_from_dict_config(): @pytest.mark.asyncio -async def test_manager_propagates_groq_transcription_api_base_to_channels(): - from nanobot.channels.manager import ChannelManager - - fake_config = SimpleNamespace( - channels=ChannelsConfig.model_validate({ - "fakeplugin": {"enabled": True, "allowFrom": ["*"]}, - "transcriptionLanguage": "en", - }), - providers=SimpleNamespace( - groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"), - openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"), - ), - ) - - with patch( - "nanobot.channels.registry.discover_enabled", - return_value={"fakeplugin": _FakePlugin}, - ): - mgr = ChannelManager.__new__(ChannelManager) - mgr.config = fake_config - mgr.bus = MessageBus() - mgr.channels = {} - mgr._dispatch_task = None - mgr._init_channels() - - channel = mgr.channels["fakeplugin"] - assert channel.transcription_provider == "groq" - assert channel.transcription_api_key == "groq-key" - assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions" - assert channel.transcription_language == "en" - - -@pytest.mark.asyncio -async def test_manager_propagates_openai_transcription_api_base_to_channels(): - from nanobot.channels.manager import ChannelManager - - fake_config = SimpleNamespace( - channels=ChannelsConfig.model_validate({ - "fakeplugin": {"enabled": True, "allowFrom": ["*"]}, - "transcriptionProvider": "openai", - }), - providers=SimpleNamespace( - openai=SimpleNamespace( - api_key="openai-key", - 
api_base="http://proxy.local/v1/audio/transcriptions", - ), - groq=SimpleNamespace(api_key="groq-key", api_base=""), - ), - ) - - with patch( - "nanobot.channels.registry.discover_enabled", - return_value={"fakeplugin": _FakePlugin}, - ): - mgr = ChannelManager.__new__(ChannelManager) - mgr.config = fake_config - mgr.bus = MessageBus() - mgr.channels = {} - mgr._dispatch_task = None - mgr._init_channels() - - channel = mgr.channels["fakeplugin"] - assert channel.transcription_provider == "openai" - assert channel.transcription_api_key == "openai-key" - assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions" - - -@pytest.mark.asyncio -async def test_base_channel_passes_api_base_to_openai_transcription_provider(): - """BaseChannel.transcribe_audio must forward transcription_api_base to OpenAI.""" +async def test_base_channel_reads_current_transcription_config_each_call( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +): + """BaseChannel.transcribe_audio resolves config at call time, not manager init time.""" from nanobot.providers import transcription as transcription_mod - channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus()) - channel.transcription_provider = "openai" - channel.transcription_api_key = "k" - channel.transcription_api_base = "http://override/v1/audio/transcriptions" - channel.transcription_language = "en" + config_path = tmp_path / "config.json" + config = Config() + config.transcription.provider = "openai" + config.transcription.model = "whisper-custom" + config.transcription.language = "en" + config.providers.openai.api_key = "openai-key" + config.providers.openai.api_base = "http://openai.local/v1/audio/transcriptions" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) - captured: dict[str, object] = {} + channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus()) + + calls: list[dict[str, object]] = [] class 
_StubOpenAI: - def __init__(self, api_key=None, api_base=None, language=None): - captured["api_key"] = api_key - captured["api_base"] = api_base - captured["language"] = language + def __init__(self, api_key=None, api_base=None, language=None, model=None): + calls.append({ + "provider": "openai", + "api_key": api_key, + "api_base": api_base, + "language": language, + "model": model, + }) async def transcribe(self, file_path): - return "ok" + return "openai-ok" - with patch.object(transcription_mod, "OpenAITranscriptionProvider", _StubOpenAI): - result = await channel.transcribe_audio("/tmp/does-not-matter.wav") + class _StubGroq: + def __init__(self, api_key=None, api_base=None, language=None, model=None): + calls.append({ + "provider": "groq", + "api_key": api_key, + "api_base": api_base, + "language": language, + "model": model, + }) - assert result == "ok" - assert captured["api_key"] == "k" - assert captured["api_base"] == "http://override/v1/audio/transcriptions" - assert captured["language"] == "en" + async def transcribe(self, file_path): + return "groq-ok" + + with ( + patch.object(transcription_mod, "OpenAITranscriptionProvider", _StubOpenAI), + patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq), + ): + assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "openai-ok" + + config.transcription.provider = "groq" + config.transcription.model = "whisper-large-v3-turbo" + config.transcription.language = "ko" + config.providers.groq.api_key = "groq-key" + config.providers.groq.api_base = "http://groq.local/v1/audio/transcriptions" + save_config(config, config_path) + + assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "groq-ok" + + assert calls == [ + { + "provider": "openai", + "api_key": "openai-key", + "api_base": "http://openai.local/v1/audio/transcriptions", + "language": "en", + "model": "whisper-custom", + }, + { + "provider": "groq", + "api_key": "groq-key", + "api_base": 
"http://groq.local/v1/audio/transcriptions", + "language": "ko", + "model": "whisper-large-v3-turbo", + }, + ] + + +@pytest.mark.asyncio +async def test_base_channel_respects_disabled_transcription_config( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +): + config_path = tmp_path / "config.json" + config = Config() + config.transcription.enabled = False + config.providers.groq.api_key = "groq-key" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + + channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus()) + + with patch("nanobot.providers.transcription.GroqTranscriptionProvider") as provider: + assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "" + provider.assert_not_called() def test_openai_transcription_provider_honors_api_base_argument(): @@ -348,37 +350,6 @@ def test_openai_transcription_provider_honors_api_base_argument(): assert custom.api_url == "http://override/v1/audio/transcriptions" -@pytest.mark.asyncio -async def test_base_channel_passes_language_to_groq_transcription_provider(): - """BaseChannel.transcribe_audio must forward transcription_language to Groq.""" - from nanobot.providers import transcription as transcription_mod - - channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus()) - channel.transcription_provider = "groq" - channel.transcription_api_key = "k" - channel.transcription_api_base = "http://override/v1/audio/transcriptions" - channel.transcription_language = "ko" - - captured: dict[str, object] = {} - - class _StubGroq: - def __init__(self, api_key=None, api_base=None, language=None): - captured["api_key"] = api_key - captured["api_base"] = api_base - captured["language"] = language - - async def transcribe(self, file_path): - return "ok" - - with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq): - result = await channel.transcribe_audio("/tmp/does-not-matter.wav") - - assert result == "ok" - 
assert captured["api_key"] == "k" - assert captured["api_base"] == "http://override/v1/audio/transcriptions" - assert captured["language"] == "ko" - - # --------------------------------------------------------------------------- # Transcription provider HTTP tests # --------------------------------------------------------------------------- diff --git a/tests/channels/test_websocket_envelope_media.py b/tests/channels/test_websocket_envelope_media.py index 0b67320da..88c24e479 100644 --- a/tests/channels/test_websocket_envelope_media.py +++ b/tests/channels/test_websocket_envelope_media.py @@ -69,6 +69,7 @@ def _make_channel() -> WebSocketChannel: [ ("data:image/png;base64,AAAA", "image/png"), ("data:image/jpeg;base64,AAAA", "image/jpeg"), + ("data:audio/webm;codecs=opus;base64,AAAA", "audio/webm"), ("data:IMAGE/PNG;base64,AAAA", "image/png"), ("data:image/svg+xml;base64,AAAA", "image/svg+xml"), ("data:text/plain;base64,AAAA", "text/plain"), diff --git a/tests/channels/test_whatsapp_channel.py b/tests/channels/test_whatsapp_channel.py index 5032ca410..cb5fc639b 100644 --- a/tests/channels/test_whatsapp_channel.py +++ b/tests/channels/test_whatsapp_channel.py @@ -271,8 +271,6 @@ async def test_lid_to_phone_cache_resolves_lid_only_messages(): async def test_voice_message_transcription_uses_media_path(): """Voice messages are transcribed when media path is available.""" ch = WhatsAppChannel({"enabled": True, "allowFrom": ["*"]}, MagicMock()) - ch.transcription_provider = "openai" - ch.transcription_api_key = "sk-test" ch._handle_message = AsyncMock() ch.transcribe_audio = AsyncMock(return_value="Hello world") diff --git a/tests/providers/test_transcription.py b/tests/providers/test_transcription.py index 14a784b2e..c669a91d3 100644 --- a/tests/providers/test_transcription.py +++ b/tests/providers/test_transcription.py @@ -8,6 +8,8 @@ from unittest.mock import AsyncMock, patch import httpx import pytest +from nanobot.audio.transcription import 
resolve_transcription_config +from nanobot.config.schema import Config from nanobot.providers.transcription import ( GroqTranscriptionProvider, OpenAITranscriptionProvider, @@ -33,6 +35,65 @@ def _raw_response(status: int, content: bytes) -> httpx.Response: return httpx.Response(status_code=status, content=content, request=request) +def test_resolver_uses_legacy_channel_provider_when_top_level_is_unset() -> None: + config = Config() + config.channels.transcription_provider = "openai" + config.channels.transcription_language = "en" + config.providers.openai.api_key = "sk-test" + config.providers.openai.api_base = "https://proxy.example/v1" + + resolved = resolve_transcription_config(config) + + assert resolved.provider == "openai" + assert resolved.model == "whisper-1" + assert resolved.language == "en" + assert resolved.api_key == "sk-test" + assert resolved.api_base == "https://proxy.example/v1" + assert resolved.configured is True + + +def test_resolver_prefers_top_level_transcription_over_legacy_channels() -> None: + config = Config() + config.channels.transcription_provider = "openai" + config.channels.transcription_language = "en" + config.transcription.provider = "groq" + config.transcription.model = "whisper-large-v3-turbo" + config.transcription.language = "ko" + config.providers.groq.api_key = "gsk-test" + config.providers.groq.api_base = "https://groq.example/openai/v1" + + resolved = resolve_transcription_config(config) + + assert resolved.provider == "groq" + assert resolved.model == "whisper-large-v3-turbo" + assert resolved.language == "ko" + assert resolved.api_key == "gsk-test" + assert resolved.api_base == "https://groq.example/openai/v1" + + +def test_resolved_transcription_repr_hides_api_key() -> None: + config = Config() + config.providers.groq.api_key = "gsk-secret" + + resolved = resolve_transcription_config(config) + + assert "gsk-secret" not in repr(resolved) + assert "api_key" not in repr(resolved) + + +def 
test_resolver_keeps_enabled_and_limits_on_effective_config() -> None: + config = Config() + config.transcription.enabled = False + config.transcription.max_duration_sec = 45 + config.transcription.max_upload_mb = 12 + + resolved = resolve_transcription_config(config) + + assert resolved.enabled is False + assert resolved.max_duration_sec == 45 + assert resolved.max_upload_mb == 12 + + # --------------------------------------------------------------------------- # OpenAI provider — retry on transient HTTP + network errors # --------------------------------------------------------------------------- @@ -215,6 +276,32 @@ async def test_provider_omits_language_when_unset( assert "language" not in files +@pytest.mark.asyncio +async def test_provider_forwards_custom_model_in_multipart(audio_file: Path) -> None: + provider = GroqTranscriptionProvider(api_key="k", model="whisper-large-v3-turbo") + post = AsyncMock(return_value=_response(200, {"text": "ok"})) + with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()): + result = await provider.transcribe(audio_file) + + assert result == "ok" + files = post.await_args_list[0].kwargs["files"] + assert files["model"] == (None, "whisper-large-v3-turbo") + + +@pytest.mark.asyncio +async def test_provider_forwards_file_mime_type(tmp_path: Path) -> None: + audio = tmp_path / "voice.webm" + audio.write_bytes(b"audio") + provider = GroqTranscriptionProvider(api_key="k") + post = AsyncMock(return_value=_response(200, {"text": "ok"})) + with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()): + result = await provider.transcribe(audio) + + assert result == "ok" + files = post.await_args_list[0].kwargs["files"] + assert files["file"] == ("voice.webm", b"audio", "audio/webm") + + @pytest.mark.asyncio async def test_language_survives_retry(audio_file: Path) -> None: """Regression: language must be present on every retry attempt, not just the first.""" diff --git 
a/tests/tools/test_exec_session_tools.py b/tests/tools/test_exec_session_tools.py index 2c99a2c3b..3ef3f37b8 100644 --- a/tests/tools/test_exec_session_tools.py +++ b/tests/tools/test_exec_session_tools.py @@ -6,8 +6,12 @@ import shlex import subprocess import sys +from nanobot.agent.tools.exec_session import ( + ExecSessionManager, + ListExecSessionsTool, + WriteStdinTool, +) from nanobot.agent.tools.shell import ExecTool -from nanobot.agent.tools.exec_session import ExecSessionManager, ListExecSessionsTool, WriteStdinTool def _python_command(code: str) -> str: @@ -141,7 +145,7 @@ def test_exec_can_continue_with_stdin(tmp_path): return initial, result initial, result = asyncio.run(run()) - assert "ready" in initial + assert "ready" in initial + result assert "Process running" in initial assert "Elapsed:" in initial assert "got:ping" in result @@ -170,7 +174,7 @@ def test_write_stdin_can_close_stdin(tmp_path): return initial, result initial, result = asyncio.run(run()) - assert "ready" in initial + assert "ready" in initial + result assert "got:payload" in result assert "Stdin closed." 
in result assert "Exit code: 0" in result @@ -185,14 +189,20 @@ def test_write_stdin_can_terminate_session(tmp_path): "import time; print('ready', flush=True); time.sleep(30)" ) - initial = await exec_tool.execute(command=command, yield_time_ms=500) + initial = await exec_tool.execute(command=command, yield_time_ms=100) sid = _session_id(initial) + waited = await stdin_tool.execute( + session_id=sid, + wait_for="ready", + wait_timeout_ms=3000, + yield_time_ms=0, + ) result = await stdin_tool.execute( session_id=sid, terminate=True, yield_time_ms=0, ) - return initial, result + return initial + waited, result initial, result = asyncio.run(run()) assert "ready" in initial @@ -243,7 +253,7 @@ def test_write_stdin_preserves_completed_session_output_until_polled(tmp_path): initial, final = asyncio.run(run()) - assert "ready" in initial + assert "ready" in initial + final assert "done" in final assert "Exit code: 0" in final diff --git a/tests/utils/test_media_decode.py b/tests/utils/test_media_decode.py index 5926ab2b6..a0f357c4a 100644 --- a/tests/utils/test_media_decode.py +++ b/tests/utils/test_media_decode.py @@ -8,8 +8,8 @@ import pytest from nanobot.utils.media_decode import ( DEFAULT_MAX_BYTES, - FileSizeExceeded, MAX_FILE_SIZE, + FileSizeExceeded, save_base64_data_url, ) @@ -25,6 +25,31 @@ def test_saves_png_with_correct_extension(tmp_path) -> None: assert (tmp_path / result.split("/")[-1]).read_bytes() == b"fake png" +def test_saves_data_url_with_mime_parameters(tmp_path) -> None: + result = save_base64_data_url(_data_url(b"voice", mime="audio/webm;codecs=opus"), tmp_path) + assert result is not None + assert result.endswith(".webm") + assert (tmp_path / result.split("/")[-1]).read_bytes() == b"voice" + + +@pytest.mark.parametrize( + ("mime", "suffix"), + [ + ("audio/webm", ".webm"), + ("video/webm", ".webm"), + ("audio/ogg", ".ogg"), + ("audio/wav", ".wav"), + ("audio/mpga", ".mpga"), + ], +) +def test_saves_common_audio_with_api_friendly_extension( + 
tmp_path, mime: str, suffix: str +) -> None: + result = save_base64_data_url(_data_url(b"voice", mime=mime), tmp_path) + assert result is not None + assert result.endswith(suffix) + + def test_returns_none_for_malformed_data_url(tmp_path) -> None: assert save_base64_data_url("not-a-data-url", tmp_path) is None diff --git a/tests/webui/test_settings_api.py b/tests/webui/test_settings_api.py index d48dd6bd1..b9043816c 100644 --- a/tests/webui/test_settings_api.py +++ b/tests/webui/test_settings_api.py @@ -18,6 +18,7 @@ from nanobot.webui.settings_api import ( update_agent_settings, update_model_configuration, update_network_safety_settings, + update_transcription_settings, ) @@ -243,6 +244,75 @@ def test_settings_payload_includes_network_safety_fields( assert payload["advanced"]["ssrf_whitelist_count"] == 1 +def test_settings_payload_includes_effective_transcription_config( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + config = Config() + config.channels.transcription_provider = "openai" + config.channels.transcription_language = "en" + config.providers.openai.api_key = "sk-test" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + + payload = settings_payload() + + assert payload["transcription"]["enabled"] is True + assert payload["transcription"]["provider"] == "openai" + assert payload["transcription"]["provider_configured"] is True + assert payload["transcription"]["model"] == "whisper-1" + assert payload["transcription"]["language"] == "en" + + +def test_update_transcription_settings_writes_top_level_only( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + config = Config() + config.channels.transcription_provider = "openai" + config.channels.transcription_language = "en" + config.providers.groq.api_key = "gsk-test" + save_config(config, config_path) + 
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + + payload = update_transcription_settings( + { + "enabled": ["true"], + "provider": ["groq"], + "model": ["whisper-large-v3-turbo"], + "language": ["ko"], + "maxDurationSec": ["90"], + "maxUploadMb": ["20"], + } + ) + + saved = load_config(config_path) + assert saved.channels.transcription_provider == "openai" + assert saved.channels.transcription_language == "en" + assert saved.transcription.enabled is True + assert saved.transcription.provider == "groq" + assert saved.transcription.model == "whisper-large-v3-turbo" + assert saved.transcription.language == "ko" + assert saved.transcription.max_duration_sec == 90 + assert saved.transcription.max_upload_mb == 20 + assert payload["transcription"]["provider"] == "groq" + assert payload["transcription"]["provider_configured"] is True + + +def test_update_transcription_settings_validates_language( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + save_config(Config(), config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + + with pytest.raises(WebUISettingsError, match="transcription language"): + update_transcription_settings({"language": ["en-US"]}) + + def test_settings_payload_includes_token_usage_summary( tmp_path, monkeypatch: pytest.MonkeyPatch, diff --git a/tests/webui/test_transcription_ws.py b/tests/webui/test_transcription_ws.py new file mode 100644 index 000000000..3cc3770f0 --- /dev/null +++ b/tests/webui/test_transcription_ws.py @@ -0,0 +1,129 @@ +"""Tests for WebUI transcription envelopes carried over the gateway socket.""" + +from __future__ import annotations + +import base64 +from pathlib import Path +from typing import Any + +import pytest + +from nanobot.config.loader import save_config +from nanobot.config.schema import Config +from nanobot.webui.transcription_ws import webui_transcription_event + + +def _audio_data_url(payload: 
bytes = b"voice", mime: str = "audio/webm") -> str: + return f"data:{mime};base64,{base64.b64encode(payload).decode('ascii')}" + + +@pytest.mark.asyncio +async def test_webui_transcribe_audio_rejects_unconfigured_provider( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + config = Config() + config.transcription.provider = "groq" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + + event, payload = await webui_transcription_event({ + "request_id": "voice-1", + "data_url": _audio_data_url(), + }) + + assert event == "transcription_error" + assert payload == { + "request_id": "voice-1", + "detail": "not_configured", + "provider": "groq", + } + + +@pytest.mark.asyncio +async def test_webui_transcribe_audio_rejects_unsupported_mime( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + config = Config() + config.transcription.provider = "groq" + config.providers.groq.api_key = "gsk-test" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + + event, payload = await webui_transcription_event({ + "request_id": "voice-1", + "data_url": _audio_data_url(mime="text/plain"), + }) + + assert event == "transcription_error" + assert payload["request_id"] == "voice-1" + assert payload["detail"] == "mime" + + +@pytest.mark.asyncio +async def test_webui_transcribe_audio_rejects_oversized_audio( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + config = Config() + config.transcription.provider = "groq" + config.transcription.max_upload_mb = 1 + config.providers.groq.api_key = "gsk-test" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + monkeypatch.setattr("nanobot.audio.transcription.get_media_dir", lambda _channel=None: tmp_path) + + 
event, payload = await webui_transcription_event({ + "request_id": "voice-1", + "data_url": _audio_data_url(payload=b"x" * (1024 * 1024 + 1)), + }) + + assert event == "transcription_error" + assert payload["request_id"] == "voice-1" + assert payload["detail"] == "size" + + +@pytest.mark.asyncio +async def test_webui_transcribe_audio_returns_text_and_removes_temp_file( + tmp_path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + config_path = tmp_path / "config.json" + media_dir = tmp_path / "media" + media_dir.mkdir() + config = Config() + config.transcription.provider = "groq" + config.providers.groq.api_key = "gsk-test" + save_config(config, config_path) + monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path) + monkeypatch.setattr( + "nanobot.audio.transcription.get_media_dir", + lambda _channel=None: media_dir, + ) + captured_paths: list[Path] = [] + + async def fake_transcribe_audio_file(path: str | Path, _resolved: Any) -> str: + p = Path(path) + assert p.exists() + captured_paths.append(p) + return "hello voice" + + monkeypatch.setattr( + "nanobot.audio.transcription.transcribe_audio_file", + fake_transcribe_audio_file, + ) + + event, payload = await webui_transcription_event({ + "request_id": "voice-1", + "data_url": _audio_data_url(payload=b"webm voice", mime="audio/webm;codecs=opus"), + "duration_ms": 1200, + }) + + assert event == "transcription_result" + assert payload == {"request_id": "voice-1", "text": "hello voice"} + assert captured_paths + assert not captured_paths[0].exists() diff --git a/webui/src/App.tsx b/webui/src/App.tsx index 95e4c57ec..4fe6d20e7 100644 --- a/webui/src/App.tsx +++ b/webui/src/App.tsx @@ -81,6 +81,7 @@ const SETTINGS_SECTION_KEYS: SettingsSectionKey[] = [ "appearance", "models", "image", + "voice", "browser", "apps", "skills", diff --git a/webui/src/components/CodeBlock.tsx b/webui/src/components/CodeBlock.tsx index 289726960..5fd1c51a9 100644 --- a/webui/src/components/CodeBlock.tsx +++ 
b/webui/src/components/CodeBlock.tsx @@ -1,8 +1,9 @@ -import { Suspense, lazy, useCallback, useState } from "react"; +import { Suspense, lazy, useCallback, useState, type ReactNode } from "react"; import { Check, Copy } from "lucide-react"; import { useTranslation } from "react-i18next"; import { useThemeValue } from "@/hooks/useTheme"; +import { hasAnsi, parseAnsiSegments, stripAnsi } from "@/lib/ansi"; import { cn } from "@/lib/utils"; interface CodeBlockProps { @@ -36,6 +37,10 @@ const CODE_FONT_STACK = [ "monospace", ].join(", "); +const ANSI_LANGUAGES = new Set(["ansi", "ansi-output"]); +const CODE_SURFACE_LIGHT = "#f4f4f5"; +const CODE_SURFACE_DARK = "#27272a"; + const LazyHighlightedCode = lazy(async () => { const [ { default: SyntaxHighlighter }, @@ -74,7 +79,11 @@ const LazyHighlightedCode = lazy(async () => { language={language || "text"} style={transparentTheme} customStyle={{ - background: chrome === "none" ? "transparent" : undefined, + background: chrome === "none" + ? "transparent" + : isDark + ? CODE_SURFACE_DARK + : CODE_SURFACE_LIGHT, margin: 0, padding: chrome === "none" ? "0.75rem 1rem" : "1rem", fontFamily: CODE_FONT_STACK, @@ -83,10 +92,10 @@ const LazyHighlightedCode = lazy(async () => { tabSize: 2, }} codeTagProps={{ - style: chrome === "none" ? 
{ + style: { background: "transparent", fontFamily: CODE_FONT_STACK, - } : undefined, + }, }} lineNumberStyle={{ minWidth: "2.6em", @@ -106,14 +115,32 @@ const LazyHighlightedCode = lazy(async () => { }; }); -function PlainCodeFallback({ +function renderPlainText(value: string): ReactNode { + return value; +} + +function renderAnsiText(value: string): ReactNode { + return parseAnsiSegments(value).map((segment, index) => ( + + {segment.text} + + )); +} + +function CodeTextBlock({ code, chrome, showLineNumbers, + testId, + className, + renderText = renderPlainText, }: { code: string; chrome: "default" | "none"; showLineNumbers: boolean; + testId: string; + className?: string; + renderText?: (value: string) => ReactNode; }) { const lines = code.split("\n"); return ( @@ -121,10 +148,11 @@ function PlainCodeFallback({ className={cn( "m-0 overflow-x-auto p-4 font-mono text-sm leading-[1.6] text-foreground/90", showLineNumbers ? "whitespace-pre" : "whitespace-pre-wrap", - chrome === "default" ? "bg-background" : "bg-transparent", + chrome === "default" ? "bg-zinc-100 dark:bg-zinc-800" : "bg-transparent", chrome === "none" && "p-3 text-[13px] leading-[1.55]", + className, )} - data-testid="plain-code-fallback" + data-testid={testId} > {showLineNumbers ? ( @@ -133,16 +161,21 @@ function PlainCodeFallback({ {index + 1} - {line || " "} + {renderText(line || " ")} {index < lines.length - 1 ? 
"\n" : null} )) - ) : code} + ) : renderText(code)} ); } +function shouldRenderAnsi(language: string | undefined, code: string): boolean { + const normalized = language?.trim().toLowerCase(); + return Boolean((normalized && ANSI_LANGUAGES.has(normalized)) || hasAnsi(code)); +} + export function CodeBlock({ language, code, @@ -156,19 +189,20 @@ export function CodeBlock({ const [copied, setCopied] = useState(false); const isDark = useThemeValue() === "dark"; const hasChrome = chrome === "default"; + const renderAnsi = shouldRenderAnsi(language, code); const onCopy = useCallback(() => { if (!navigator.clipboard) return; - navigator.clipboard.writeText(code).then(() => { + navigator.clipboard.writeText(renderAnsi ? stripAnsi(code) : code).then(() => { setCopied(true); setTimeout(() => setCopied(false), 1_500); }); - }, [code]); + }, [code, renderAnsi]); return (
) : null} - {highlight ? ( + {renderAnsi ? ( + + ) : highlight ? ( } > @@ -226,10 +269,11 @@ export function CodeBlock({ /> ) : ( - )} diff --git a/webui/src/components/settings/SettingsView.tsx b/webui/src/components/settings/SettingsView.tsx index fd726ea89..c06bd41ae 100644 --- a/webui/src/components/settings/SettingsView.tsx +++ b/webui/src/components/settings/SettingsView.tsx @@ -31,6 +31,7 @@ import { Layers, Loader2, LogOut, + Mic, Moon, PlayCircle, Plus, @@ -92,6 +93,7 @@ import { updateNetworkSafetySettings, updateProviderSettings, updateSettings, + updateTranscriptionSettings, updateWebSearchSettings, } from "@/lib/api"; import { notifyCliAppsChanged } from "@/lib/cli-app-events"; @@ -115,6 +117,7 @@ import type { ProviderModelsPayload, SettingsPayload, SkillSummary, + TranscriptionSettingsUpdate, WebSearchSettingsUpdate, WebuiDefaultAccessMode, } from "@/lib/types"; @@ -124,6 +127,7 @@ export type SettingsSectionKey = | "appearance" | "models" | "image" + | "voice" | "browser" | "apps" | "skills" @@ -367,6 +371,26 @@ const DEFAULT_IMAGE_GENERATION_FORM: ImageGenerationSettingsUpdate = { maxImagesPerTurn: 4, }; +const DEFAULT_TRANSCRIPTION_FORM: TranscriptionSettingsUpdate = { + enabled: true, + provider: "groq", + model: "", + language: "", + maxDurationSec: 120, + maxUploadMb: 25, +}; + +const DEFAULT_TRANSCRIPTION_SETTINGS: NonNullable = { + enabled: true, + provider: "groq", + provider_configured: false, + model: "whisper-large-v3", + language: null, + max_duration_sec: 120, + max_upload_mb: 25, + providers: [], +}; + const DEFAULT_NETWORK_SAFETY_FORM: NetworkSafetySettingsUpdate = { webuiAllowLocalServiceAccess: true, webuiDefaultAccessMode: "default", @@ -419,6 +443,18 @@ function imageGenerationFormFromPayload(payload: SettingsPayload): ImageGenerati }; } +function transcriptionFormFromPayload(payload: SettingsPayload): TranscriptionSettingsUpdate { + const transcription = payload.transcription ?? 
DEFAULT_TRANSCRIPTION_SETTINGS; + return { + enabled: transcription.enabled, + provider: transcription.provider, + model: transcription.model, + language: transcription.language ?? "", + maxDurationSec: transcription.max_duration_sec, + maxUploadMb: transcription.max_upload_mb, + }; +} + function networkSafetyFormFromPayload(payload: SettingsPayload): NetworkSafetySettingsUpdate { return { webuiAllowLocalServiceAccess: @@ -479,6 +515,7 @@ export function SettingsView({ const [providerSaving, setProviderSaving] = useState(null); const [webSearchSaving, setWebSearchSaving] = useState(false); const [imageGenerationSaving, setImageGenerationSaving] = useState(false); + const [transcriptionSaving, setTranscriptionSaving] = useState(false); const [networkSafetySaving, setNetworkSafetySaving] = useState(false); const [hostEngineApplying, setHostEngineApplying] = useState(false); const [error, setError] = useState(null); @@ -511,6 +548,9 @@ export function SettingsView({ ? imageGenerationFormFromPayload(initialSettings) : DEFAULT_IMAGE_GENERATION_FORM, ); + const [transcriptionForm, setTranscriptionForm] = useState( + () => initialSettings ? transcriptionFormFromPayload(initialSettings) : DEFAULT_TRANSCRIPTION_FORM, + ); const [networkSafetyForm, setNetworkSafetyForm] = useState(() => initialSettings ? 
networkSafetyFormFromPayload(initialSettings) : DEFAULT_NETWORK_SAFETY_FORM, ); @@ -543,6 +583,7 @@ export function SettingsView({ setForm(agentDraftFromPayload(payload)); setWebSearchForm((prev) => webSearchFormFromPayload(payload, prev)); setImageGenerationForm(imageGenerationFormFromPayload(payload)); + setTranscriptionForm(transcriptionFormFromPayload(payload)); setNetworkSafetyForm(networkSafetyFormFromPayload(payload)); if (payload.restart_required_sections) { setPendingRestartSections(pendingRestartSectionsFromPayload(payload)); @@ -711,6 +752,19 @@ export function SettingsView({ ); }, [imageGenerationForm, settings]); + const transcriptionDirty = useMemo(() => { + if (!settings) return false; + const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS; + return ( + transcriptionForm.enabled !== transcription.enabled || + transcriptionForm.provider !== transcription.provider || + transcriptionForm.model !== transcription.model || + transcriptionForm.language !== (transcription.language ?? 
"") || + transcriptionForm.maxDurationSec !== transcription.max_duration_sec || + transcriptionForm.maxUploadMb !== transcription.max_upload_mb + ); + }, [settings, transcriptionForm]); + const networkSafetyDirty = useMemo(() => { if (!settings) return false; const currentLocalServiceAccess = @@ -913,6 +967,24 @@ export function SettingsView({ } }; + const saveTranscriptionSettings = async () => { + if (!settings || !transcriptionDirty || transcriptionSaving) return; + setTranscriptionSaving(true); + try { + const payload = await updateTranscriptionSettings(token, transcriptionForm); + applyPayload(payload); + if (payload.requires_restart) { + setPendingRestartSections((prev) => ({ ...prev, browser: true })); + } + await maybeRestartHostEngine(payload); + setError(null); + } catch (err) { + setError((err as Error).message); + } finally { + setTranscriptionSaving(false); + } + }; + const saveNetworkSafetySettings = async () => { if (!settings || !networkSafetyDirty || networkSafetySaving) return; setNetworkSafetySaving(true); @@ -1333,6 +1405,22 @@ export function SettingsView({ requiresRestartPending={pendingRestartSections.image} /> ); + case "voice": + return ( + selectSection("models")} + showBrandLogos={localPrefs.brandLogos} + onRestart={restartViaSettingsSurface} + isRestarting={isRestarting || hostEngineApplying} + requiresRestartPending={pendingRestartSections.browser} + /> + ); case "browser": return ( provider.name === settings.web_search.provider) ?? + settings.web_search.providers[0]; + const webSearchProviderLabel = providerDisplayLabel( + settings.web_search.providers, + settings.web_search.provider, + ); + const webSearchCredentialStatus = + webSearchProvider?.credential === "none" + ? tx("settings.byok.webSearch.noCredentialRequired", "No key required") + : webSearchProvider?.credential === "base_url" + ? settings.web_search.base_url + ? 
tx("settings.values.configured", "Configured") + : tx("settings.values.notConfigured", "Not configured") + : settings.web_search.api_key_hint + ? tx("settings.values.configured", "Configured") + : tx("settings.values.notConfigured", "Not configured"); + const webCaption = `${webSearchProviderLabel} · ${webSearchCredentialStatus}`; const imageStatus = settings.image_generation.enabled ? tx("settings.values.enabled", "Enabled") : tx("settings.values.disabled", "Disabled"); @@ -1650,6 +1757,15 @@ function OverviewSettings({ ? tx("settings.values.configured", "Configured") : tx("settings.values.notConfigured", "Not configured") }`; + const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS; + const voiceStatus = transcription.enabled + ? tx("settings.values.enabled", "Enabled") + : tx("settings.values.disabled", "Disabled"); + const voiceCaption = `${providerDisplayLabel(transcription.providers, transcription.provider)} · ${ + transcription.provider_configured + ? tx("settings.values.configured", "Configured") + : tx("settings.values.notConfigured", "Not configured") + }`; const isNativeHost = (settings.surface ?? 
settings.runtime_surface) === "native"; const workspaceCaption = shortWorkspacePath(settings.runtime.workspace_path); const runtimeTitle = isNativeHost @@ -1691,8 +1807,8 @@ function OverviewSettings({ icon={Globe2} valueLogoProvider={settings.web_search.provider} title={tx("settings.overview.webSearch", "Web search")} - value={providerDisplayLabel(settings.web_search.providers, settings.web_search.provider)} - caption={webStatus} + value={webStatus} + caption={webCaption} showBrandLogos={showBrandLogos} onClick={() => onSelectSection("browser")} /> @@ -1705,6 +1821,15 @@ function OverviewSettings({ showBrandLogos={showBrandLogos} onClick={() => onSelectSection("image")} /> + onSelectSection("voice")} + /> @@ -2654,6 +2779,137 @@ function ImageGenerationSettings({ ); } +function TranscriptionSettings({ + settings, + form, + dirty, + saving, + onChangeForm, + onSave, + onOpenProviders, + showBrandLogos, + onRestart, + isRestarting, + requiresRestartPending, +}: { + settings: SettingsPayload; + form: TranscriptionSettingsUpdate; + dirty: boolean; + saving: boolean; + onChangeForm: Dispatch>; + onSave: () => void; + onOpenProviders: () => void; + showBrandLogos: boolean; + onRestart?: () => void; + isRestarting?: boolean; + requiresRestartPending: boolean; +}) { + const { t } = useTranslation(); + const tx = (key: string, fallback: string) => t(key, { defaultValue: fallback }); + const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS; + const selectedProvider = + transcription.providers.find((provider) => provider.name === form.provider) ?? + transcription.providers[0]; + const providerConfigured = !!selectedProvider?.configured; + + return ( +
+ {tx("settings.sections.voiceInput", "Voice input")} + + + onChangeForm((prev) => ({ ...prev, enabled }))} + ariaLabel={tx("settings.rows.transcription", "Transcription")} + label={form.enabled ? tx("settings.values.on", "On") : tx("settings.values.off", "Off")} + /> + + + onChangeForm((prev) => ({ ...prev, provider }))} + /> + + +
+ + {providerConfigured + ? tx("settings.values.configured", "Configured") + : tx("settings.values.notConfigured", "Not configured")} + + {!providerConfigured ? ( + + ) : null} +
+
+ + onChangeForm((prev) => ({ ...prev, model: event.target.value }))} + className="h-8 w-[min(300px,70vw)] rounded-full text-[13px]" + /> + + + onChangeForm((prev) => ({ ...prev, language: event.target.value }))} + placeholder={tx("settings.voice.languageAuto", "Auto")} + className="h-8 w-[min(180px,60vw)] rounded-full text-[13px]" + /> + + +
+ onChangeForm((prev) => ({ ...prev, maxDurationSec }))} + /> + onChangeForm((prev) => ({ ...prev, maxUploadMb }))} + /> +
+
+ +
+
+ ); +} + function WebSettings({ settings, form, diff --git a/webui/src/components/settings/TokenUsageHeatmap.tsx b/webui/src/components/settings/TokenUsageHeatmap.tsx index 488f45f8e..3e5939e12 100644 --- a/webui/src/components/settings/TokenUsageHeatmap.tsx +++ b/webui/src/components/settings/TokenUsageHeatmap.tsx @@ -78,16 +78,13 @@ function buildTokenUsageCalendar( const today = utcDateFromIsoDay(isoDayInTimeZone(new Date(), timeZone)); const end = addUtcDays(today, 6 - today.getUTCDay()); const start = addUtcDays(end, -(TOKEN_HEATMAP_CELLS - 1)); - const seenMonths = new Set(); const monthLabels: TokenUsageMonthLabel[] = []; const cells = Array.from({ length: TOKEN_HEATMAP_CELLS }, (_, index) => { const date = addUtcDays(start, index); const key = isoDay(date); const row = byDate.get(key); - const monthKey = key.slice(0, 7); - if (!seenMonths.has(monthKey)) { - seenMonths.add(monthKey); + if (date.getUTCDate() === 1) { monthLabels.push({ label: monthFormatter.format(date), column: Math.floor(index / 7) + 1, @@ -186,16 +183,12 @@ export function TokenUsageHeatmap({ {tx("settings.usage.shortTitle", "Token Usage")} -
+
{monthLabels.map((month) => ( {month.label} diff --git a/webui/src/components/thread/ThreadComposer.tsx b/webui/src/components/thread/ThreadComposer.tsx index 1c0c7cbdc..fba1a46fd 100644 --- a/webui/src/components/thread/ThreadComposer.tsx +++ b/webui/src/components/thread/ThreadComposer.tsx @@ -31,6 +31,7 @@ import { History, ImageIcon, Loader2, + Mic, Plus, RotateCw, Shield, @@ -46,6 +47,12 @@ import { import { useTranslation } from "react-i18next"; import { Button } from "@/components/ui/button"; +import { + Tooltip, + TooltipContent, + TooltipProvider, + TooltipTrigger, +} from "@/components/ui/tooltip"; import { WorkspaceAccessMenu, WorkspaceProjectPicker, @@ -59,6 +66,7 @@ import { } from "@/hooks/useAttachedImages"; import { useClipboardAndDrop } from "@/hooks/useClipboardAndDrop"; import type { SendImage, SendOptions } from "@/hooks/useNanobotStream"; +import { useVoiceRecorder, type VoiceRecorderErrorKey } from "@/hooks/useVoiceRecorder"; import type { CliAppInfo, GoalStateWsPayload, @@ -79,6 +87,9 @@ import { cn } from "@/lib/utils"; /** ````: aligned with the server's MIME whitelist. SVG is * deliberately excluded to avoid an embedded-script XSS surface. 
*/ const ACCEPT_ATTR = "image/png,image/jpeg,image/webp,image/gif"; +const VOICE_SHORTCUT_CODE = "KeyD"; +const VOICE_SHORTCUT_ARIA = "Control+Shift+D"; +type VoiceShortcutPlatform = "apple" | "chromeos" | "linux" | "other" | "windows"; function formatBytes(n: number): string { if (n < 1024) return `${n} B`; @@ -86,6 +97,54 @@ function formatBytes(n: number): string { return `${(n / (1024 * 1024)).toFixed(1)} MB`; } +function isVoiceShortcutDown(event: KeyboardEvent): boolean { + return ( + event.code === VOICE_SHORTCUT_CODE + && event.ctrlKey + && event.shiftKey + && !event.altKey + && !event.metaKey + ); +} + +function isVoiceShortcutRelease(event: KeyboardEvent): boolean { + return ( + event.code === VOICE_SHORTCUT_CODE + || event.key === "Control" + || event.key === "Shift" + ); +} + +function getVoiceShortcutPlatform(): VoiceShortcutPlatform { + if (typeof navigator === "undefined") return "other"; + const userAgentData = (navigator as Navigator & { userAgentData?: { platform?: string } }) + .userAgentData; + const platform = [ + userAgentData?.platform, + navigator.platform, + navigator.userAgent, + ].filter(Boolean).join(" ").toLowerCase(); + const isIpadPretendingToBeMac = + navigator.platform === "MacIntel" && navigator.maxTouchPoints > 1; + if (isIpadPretendingToBeMac || /mac|iphone|ipad|ipod/.test(platform)) return "apple"; + if (/win/.test(platform)) return "windows"; + if (/cros/.test(platform)) return "chromeos"; + if (/linux|x11|android/.test(platform)) return "linux"; + return "other"; +} + +function getVoiceShortcutLabel(): string { + switch (getVoiceShortcutPlatform()) { + case "apple": + return "⌃⇧D"; + case "chromeos": + case "linux": + case "windows": + case "other": + return "Ctrl ⇧ D"; + } +} + interface ThreadComposerProps { onSend: (content: string, images?: SendImage[], options?: SendOptions) => void; disabled?: boolean; @@ -101,6 +160,7 @@ interface ThreadComposerProps { cliApps?: CliAppInfo[]; mcpPresets?: McpPresetInfo[]; onStop?: () => 
void; + onTranscribeAudio?: (dataUrl: string, options?: { durationMs?: number }) => Promise; /** Unix seconds from server; turn elapsed timer above input while set. */ runStartedAt?: number | null; /** Sustained objective for this chat (WebSocket ``goal_state``). */ @@ -138,6 +198,45 @@ const QUEUED_PROMPTS_STORAGE_PREFIX = "nanobot.webui.composerQueuedGuidance.v1:" const QUEUED_PROMPTS_LIMIT = 20; const QUEUED_PROMPT_MAX_CHARS = 4000; +function VoiceRecordingMeter({ + ariaLabel, + className, + elapsedLabel, + isHero, + levels, +}: { + ariaLabel: string; + className?: string; + elapsedLabel: string; + isHero: boolean; + levels: number[]; +}) { + return ( +
+ + {levels.map((height, index) => ( + + ))} + + + {elapsedLabel} + +
+ ); +} + type SlashPalettePlacement = "above" | "below"; interface SlashPaletteLayout { @@ -656,6 +755,7 @@ export function ThreadComposer({ cliApps = [], mcpPresets = [], onStop, + onTranscribeAudio, runStartedAt = null, goalState, workspaceScope = null, @@ -685,7 +785,9 @@ export function ThreadComposer({ const wasStreamingRef = useRef(isStreaming); const skipNextQueuedFlushRef = useRef(false); const skipQueuedPromptPersistRef = useRef(false); + const voiceShortcutDownRef = useRef(false); const isHero = variant === "hero"; + const voiceShortcutLabel = useMemo(getVoiceShortcutLabel, []); const queuedPromptStorageKey = useMemo( () => queuedPromptsStorageKey(pendingQueueKey), [pendingQueueKey], @@ -1026,6 +1128,65 @@ export function ThreadComposer({ }); }, []); + const appendTranscription = useCallback((text: string) => { + const transcript = text.trim(); + if (!transcript) return; + setValue((current) => { + if (!current.trim()) return transcript; + const separator = /[\s\n]$/.test(current) ? 
"" : " "; + return `${current}${separator}${transcript}`; + }); + setSlashMenuDismissed(false); + setCliAppMenuDismissed(false); + setInlineError(null); + resizeTextarea(); + }, [resizeTextarea]); + + const clearInlineError = useCallback(() => setInlineError(null), []); + const setVoiceError = useCallback((key: VoiceRecorderErrorKey) => { + setInlineError(t(`thread.composer.voiceErrors.${key}`)); + }, [t]); + const voiceRecorder = useVoiceRecorder({ + disabled, + onClearError: clearInlineError, + onError: setVoiceError, + onTranscript: appendTranscription, + onTranscribeAudio, + }); + + useEffect(() => { + if (!onTranscribeAudio) return; + + function onKeyDown(event: KeyboardEvent): void { + if (!isVoiceShortcutDown(event) || event.repeat || voiceShortcutDownRef.current) return; + event.preventDefault(); + voiceShortcutDownRef.current = true; + voiceRecorder.beginShortcutHold(); + } + + function onKeyUp(event: KeyboardEvent): void { + if (!voiceShortcutDownRef.current || !isVoiceShortcutRelease(event)) return; + event.preventDefault(); + voiceShortcutDownRef.current = false; + voiceRecorder.endShortcutHold(); + } + + function onWindowBlur(): void { + if (!voiceShortcutDownRef.current) return; + voiceShortcutDownRef.current = false; + voiceRecorder.endShortcutHold(); + } + + window.addEventListener("keydown", onKeyDown); + window.addEventListener("keyup", onKeyUp); + window.addEventListener("blur", onWindowBlur); + return () => { + window.removeEventListener("keydown", onKeyDown); + window.removeEventListener("keyup", onKeyUp); + window.removeEventListener("blur", onWindowBlur); + }; + }, [onTranscribeAudio, voiceRecorder.beginShortcutHold, voiceRecorder.endShortcutHold]); + const chooseSlashCommand = useCallback( (command: SlashCommand) => { if (command.command === "/stop" && isStreaming && onStop) { @@ -1341,6 +1502,23 @@ export function ThreadComposer({ ); const attachButtonDisabled = disabled || full; + const showVoiceButton = Boolean(onTranscribeAudio); + const 
voiceRecordingStatusLabel = t("thread.composer.voice.recordingStatus", { + time: voiceRecorder.elapsedLabel, + defaultValue: `Recording ${voiceRecorder.elapsedLabel}`, + }); + const voiceButtonLabel = + voiceRecorder.state === "recording" + ? t("thread.composer.voice.stop") + : voiceRecorder.state === "transcribing" + ? t("thread.composer.voice.transcribing") + : t("thread.composer.tools.voice"); + const voiceButtonTooltip = + voiceRecorder.state === "recording" + ? t("thread.composer.voice.stop") + : voiceRecorder.state === "transcribing" + ? t("thread.composer.voice.transcribing") + : t("thread.composer.voice.hint"); const showStopButton = isStreaming && !!onStop; const relaxedHeroInput = isHero && images.length === 0 && !isStreaming; const inputTextClasses = cn( @@ -1531,7 +1709,15 @@ export function ThreadComposer({ > - {workspaceScope ? ( + {voiceRecorder.isRecording ? ( + + ) : workspaceScope ? (
- {modelLabel ? ( + {modelLabel && !voiceRecorder.isRecording ? ( ) : null} + {showVoiceButton ? ( + + + + + + + {voiceButtonTooltip} + {voiceRecorder.state === "idle" ? ( + + {voiceShortcutLabel} + + ) : null} + + + + ) : null}