feat(transcription): add shared voice input support (#4232)

* feat(webui): add voice transcription input

* feat(webui): render ANSI output in code blocks

* refactor(webui): isolate voice recorder logic

* refactor(transcription): keep websocket ingress thin

* refactor(transcription): resolve channel audio settings on demand

* style(webui): neutralize voice waveform color

* feat(webui): add voice input tooltip

* feat(webui): add voice input keyboard shortcut

* fix(webui): distinguish voice shortcut platforms

* fix(webui): place voice button after model selector

* refactor(webui): share voice hold recording helpers

* fix(desktop): allow microphone voice input

* fix(webui): stabilize token usage month labels

* feat(webui): show voice input on settings overview

* fix(webui): label voice capability as recognition

* fix(webui): align capability overview status

* refactor(webui): isolate transcription socket handling

* fix(webui): soften silent voice waveform

* refactor(audio): clarify transcription service location

* docs(transcription): clarify audio and provider boundaries

* fix(exec): reduce session output polling flake
This commit is contained in:
Xubin Ren 2026-06-09 01:08:49 +08:00 committed by GitHub
parent 06d454a225
commit 9c81280300
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
49 changed files with 3071 additions and 257 deletions

View File

@ -47,6 +47,9 @@
],
"mac": {
"category": "public.app-category.developer-tools",
"extendInfo": {
"NSMicrophoneUsageDescription": "nanobot uses the microphone to transcribe voice input before you send messages."
},
"target": [
"dmg"
]

View File

@ -15,6 +15,7 @@ import {
protocol,
session,
shell,
systemPreferences,
} from "electron";
import type { IpcMainInvokeEvent, WebContents } from "electron";
@ -100,6 +101,58 @@ function isTrustedAppUrl(rawUrl: string): boolean {
}
}
/**
 * Decides whether a permission check/request originates from the app itself.
 *
 * Three candidate URLs are collected: `details.requestingUrl`,
 * `details.securityOrigin`, and the current URL of `webContents` (when one is
 * supplied). The request counts as trusted as soon as any candidate is a
 * string accepted by `isTrustedAppUrl`.
 *
 * @param webContents - Contents that issued the request, or `null`.
 * @param details - Untyped permission details object passed by the session.
 * @returns `true` when at least one candidate URL is trusted.
 */
function isTrustedPermissionRequest(
  webContents: WebContents | null,
  details: unknown,
): boolean {
  const candidateUrls: unknown[] = [
    permissionDetail(details, "requestingUrl"),
    permissionDetail(details, "securityOrigin"),
    webContents?.getURL(),
  ];
  for (const candidate of candidateUrls) {
    if (typeof candidate === "string" && isTrustedAppUrl(candidate)) {
      return true;
    }
  }
  return false;
}
/**
 * Reads one property from an untyped permission `details` value.
 *
 * @param details - Value of unknown shape.
 * @param key - Property name to look up.
 * @returns The property value when `details` is a non-null object, otherwise
 *   `undefined`.
 */
function permissionDetail(details: unknown, key: string): unknown {
  if (typeof details !== "object" || details === null) {
    return undefined;
  }
  const bag = details as Record<string, unknown>;
  return bag[key];
}
/**
 * Tells whether a media permission asks for audio and nothing else.
 *
 * When `details.mediaTypes` is an array it must contain `"audio"` and must not
 * contain `"video"`. Otherwise the singular `details.mediaType` must be
 * exactly `"audio"`.
 *
 * @param details - Untyped permission details object.
 * @returns `true` only for audio-only requests.
 */
function isAudioOnlyMediaRequest(details: unknown): boolean {
  const requestedTypes = permissionDetail(details, "mediaTypes");
  if (!Array.isArray(requestedTypes)) {
    return permissionDetail(details, "mediaType") === "audio";
  }
  const wantsAudio = requestedTypes.includes("audio");
  const wantsVideo = requestedTypes.includes("video");
  return wantsAudio && !wantsVideo;
}
/**
 * Resolves the operating-system level microphone permission.
 *
 * On every platform other than macOS (`darwin`) this resolves to `true`
 * without consulting `systemPreferences`. On macOS the current access status
 * is read first: `"granted"` resolves to `true`, `"denied"` and `"restricted"`
 * resolve to `false`, and any other status triggers
 * `systemPreferences.askForMediaAccess("microphone")`.
 *
 * @returns Whether microphone access is available to the app.
 */
async function requestNativeMicrophoneAccess(): Promise<boolean> {
  if (process.platform !== "darwin") {
    return true;
  }
  switch (systemPreferences.getMediaAccessStatus("microphone")) {
    case "granted":
      return true;
    case "denied":
    case "restricted":
      return false;
    default:
      return await systemPreferences.askForMediaAccess("microphone");
  }
}
/**
 * Installs the permission check and request handlers on the default session.
 *
 * Both handlers apply the same rule: only the `"media"` permission is
 * considered, and only when the request is trusted
 * (`isTrustedPermissionRequest`) and audio-only (`isAudioOnlyMediaRequest`).
 * Everything else is refused. An accepted request is additionally gated on
 * `requestNativeMicrophoneAccess`; if that promise rejects the request is
 * denied.
 */
function registerPermissionHandlers(): void {
  // Shared predicate; the && chain keeps the original evaluation order.
  const isAllowedMicrophoneUse = (
    webContents: WebContents | null,
    permission: string,
    details: unknown,
  ): boolean =>
    permission === "media"
    && isTrustedPermissionRequest(webContents, details)
    && isAudioOnlyMediaRequest(details);

  const { defaultSession } = session;

  defaultSession.setPermissionCheckHandler(
    (webContents, permission, _origin, details) =>
      isAllowedMicrophoneUse(webContents, permission, details),
  );

  defaultSession.setPermissionRequestHandler(
    (webContents, permission, callback, details) => {
      if (isAllowedMicrophoneUse(webContents, permission, details)) {
        // Fire-and-forget: the outcome is reported through `callback`.
        void requestNativeMicrophoneAccess().then(callback, () => callback(false));
        return;
      }
      callback(false);
    },
  );
}
function assertTrustedIpc(event: IpcMainInvokeEvent): void {
const frameUrl = event.senderFrame?.url || event.sender.getURL();
if (!isTrustedAppUrl(frameUrl)) {
@ -749,6 +802,7 @@ app.whenReady().then(async () => {
}
registerIpcHandlers();
registerPermissionHandlers();
registerAppProtocol(webDist, devUrl);
mainWindow = createWindow();

View File

@ -234,7 +234,7 @@ nanobot channels login <channel_name> --force # re-authenticate
| `_handle_message(sender_id, chat_id, content, media?, metadata?, session_key?)` | **Call this when you receive a message.** Checks `is_allowed()`, then publishes to the bus. Automatically sets `_wants_stream` if `supports_streaming` is true. |
| `is_allowed(sender_id)` | Checks against `config.allow_from`; `"*"` allows all, `[]` denies all. |
| `default_config()` (classmethod) | Returns default config dict for `nanobot onboard`. Override to declare your fields. |
| `transcribe_audio(file_path)` | Transcribes audio via Groq Whisper (if configured). |
| `transcribe_audio(file_path)` | Transcribes audio via the shared top-level `transcription` config (if configured). |
| `supports_streaming` (property) | `True` when config has `"streaming": true` **and** subclass overrides `send_delta()`. |
| `is_running` | Returns `self._running`. |
| `login(force=False)` | Perform interactive login (e.g. QR code scan). Returns `True` if already authenticated or login succeeds. Override in subclasses that support interactive login. |

View File

@ -119,7 +119,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent
## Providers
> [!TIP]
> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead, and optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) for more accurate transcription. The API key is picked from the matching provider config.
> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` to use OpenAI Whisper. API keys still live in the matching `providers.<provider>` config.
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
@ -1100,6 +1100,61 @@ Set `agents.defaults.modelPreset` to start with a named preset:
When `modelPreset` is `null` or omitted, startup uses the implicit `default` preset from `agents.defaults.*`. Runtime changes made with `/model <preset>` are not written back to `config.json`; they affect future turns until the process restarts or another model/config change replaces them.
## Transcription Settings
Audio transcription is a shared capability used by chat-channel voice messages and by WebUI/desktop microphone input. Chat-channel voice messages are transcribed automatically before they enter the agent. WebUI and desktop microphone input is transcribed into the composer first, so you can edit the text before sending.
Configure transcription under the top-level `transcription` section:
```json
{
"transcription": {
"enabled": true,
"provider": "groq",
"model": null,
"language": null,
"maxDurationSec": 120,
"maxUploadMb": 25
}
}
```
| Setting | Default | Description |
|---------|---------|-------------|
| `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. |
| `provider` | `"groq"` | Transcription backend: `"groq"` or `"openai"`. |
| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq and `whisper-1` for OpenAI. |
| `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. |
| `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration, in seconds (1–600). |
| `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size, in megabytes (1–100). |
Provider and language resolution is intentionally ordered for backwards compatibility:
1. `transcription.provider` / `transcription.language`
2. Legacy `channels.transcriptionProvider` / `channels.transcriptionLanguage`
3. Built-in defaults (`provider: "groq"`, no language hint)
The legacy `channels.*` transcription fields existed before transcription became a shared capability across chat channels and WebUI/desktop microphone input. They are still read so older `config.json` files keep working, but they are no longer the preferred configuration surface. If both old and new fields are present, the top-level `transcription` values are the source of truth.
Transcription credentials are intentionally not stored in `transcription`. Put the API key and optional endpoint in the matching provider config:
```json
{
"providers": {
"groq": {
"apiKey": "gsk-...",
"apiBase": "https://api.groq.com/openai/v1"
}
},
"transcription": {
"provider": "groq",
"language": "zh"
}
}
```
Selecting a transcription provider does not configure credentials by itself. For example, the effective provider may default to Groq for compatibility, but transcription is only usable when `providers.groq.apiKey` or the matching environment-backed config is available. The Settings UI writes only the top-level `transcription` fields.
## Channel Settings
Global settings that apply to all channels. Configure under the `channels` section in `~/.nanobot/config.json`:
@ -1111,8 +1166,6 @@ Global settings that apply to all channels. Configure under the `channels` secti
"sendToolHints": false,
"extractDocumentText": true,
"sendMaxRetries": 3,
"transcriptionProvider": "groq",
"transcriptionLanguage": null,
"telegram": { ... }
}
}
@ -1125,8 +1178,8 @@ Global settings that apply to all channels. Configure under the `channels` secti
| `showReasoning` | `true` | Allow channels to surface model reasoning/thinking content (DeepSeek-R1 `reasoning_content`, Anthropic `thinking_blocks`, inline `<think>` tags). Reasoning flows as a dedicated stream with `_reasoning_delta` / `_reasoning_end` markers — channels override `send_reasoning_delta` / `send_reasoning_end` to render in-place updates. Even with `true`, channels without those overrides stay no-op silently. Currently surfaced on CLI and WebSocket/WebUI (italic shimmer header, auto-collapses after the stream ends); Telegram / Slack / Discord / Feishu / WeChat / Matrix keep the base no-op until their bubble UI is adapted. Independent of `sendProgress`. |
| `extractDocumentText` | `true` | Extract supported document/text attachments into the model prompt. Set to `false` to keep document content out of the prompt and include attachment path references instead. |
| `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key and optional `apiBase` are auto-resolved from the matching provider config. Chat-style bases such as `https://api.groq.com/openai/v1` are normalized to the audio transcription endpoint. |
| `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. |
`channels.transcriptionProvider` and `channels.transcriptionLanguage` are deprecated compatibility fields. They remain as a read-only fallback for older configs, but new configuration should use top-level `transcription.provider` and `transcription.language`.
`sendProgress` and `sendToolHints` can also be overridden per channel. The
global values stay as defaults for channels that do not set their own value:

View File

@ -24,6 +24,7 @@ DEFAULT_WAIT_FOR_MS = 10_000
MAX_WAIT_FOR_MS = 120_000
DEFAULT_MAX_OUTPUT_CHARS = 10_000
MAX_OUTPUT_CHARS = 50_000
OUTPUT_DRAIN_GRACE_S = 0.1
@dataclass(slots=True)
@ -139,6 +140,8 @@ class _ExecSession:
asyncio.gather(self._stdout_task, self._stderr_task),
timeout=2.0,
)
elif yield_time_ms > 0:
await self._wait_for_buffered_output()
async with self._lock:
output = "".join(self._chunks)
@ -163,6 +166,14 @@ class _ExecSession:
with suppress(asyncio.TimeoutError):
await asyncio.wait_for(self.process.wait(), timeout=5.0)
async def _wait_for_buffered_output(self) -> None:
    """Wait briefly for the first buffered output chunk to appear.

    Checks ``self._chunks`` under ``self._lock`` every 10 ms and returns as
    soon as it is non-empty, or once ``OUTPUT_DRAIN_GRACE_S`` seconds have
    elapsed, whichever comes first.
    """
    give_up_at = time.monotonic() + OUTPUT_DRAIN_GRACE_S
    while True:
        if time.monotonic() >= give_up_at:
            return
        async with self._lock:
            has_output = bool(self._chunks)
        if has_output:
            return
        await asyncio.sleep(0.01)
class ExecSessionManager:
def __init__(self, *, max_sessions: int = 8, idle_timeout: int = 1800) -> None:

View File

@ -0,0 +1,2 @@
"""Shared audio service helpers."""

View File

@ -0,0 +1,183 @@
"""Application-level audio transcription service.
This module owns nanobot's transcription behavior: config resolution,
legacy channel fallback, upload validation, temporary-file handling, and
dispatch to provider adapters. It deliberately does not know provider-specific
HTTP details; those live in ``nanobot.providers.transcription``.
"""
from __future__ import annotations
from contextlib import suppress
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal
from loguru import logger
from nanobot.config.paths import get_media_dir
from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url
TranscriptionProviderName = Literal["groq", "openai"]
_DEFAULT_PROVIDER: TranscriptionProviderName = "groq"
_DEFAULT_MODELS: dict[TranscriptionProviderName, str] = {
"groq": "whisper-large-v3",
"openai": "whisper-1",
}
_MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024
_AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({
"audio/aac",
"audio/flac",
"audio/m4a",
"audio/mp4",
"audio/mpeg",
"audio/ogg",
"audio/wav",
"audio/webm",
"audio/x-m4a",
"audio/x-wav",
})
@dataclass(frozen=True)
class EffectiveTranscriptionConfig:
    """Immutable snapshot of the transcription settings after resolution.

    Instances are produced by ``resolve_transcription_config`` and consumed by
    the ``transcribe_audio_*`` helpers in this module.
    """

    enabled: bool
    provider: TranscriptionProviderName
    model: str
    language: str | None
    # Left out of repr() so the key is not shown when the object is printed.
    api_key: str = field(repr=False)
    api_base: str
    max_duration_sec: int
    max_upload_mb: int

    @property
    def configured(self) -> bool:
        """True when a non-empty API key is present."""
        has_key = bool(self.api_key)
        return has_key
class TranscriptionIngressError(Exception):
    """Upload/transcription failure identified by a stable ``detail`` code.

    Attributes:
        detail: Short code describing the failure; it is also the exception
            message.
        extra: Additional keyword context supplied by the raiser.
    """

    def __init__(self, detail: str, **extra: Any):
        super().__init__(detail)
        self.extra = extra
        self.detail = detail
def _as_provider(value: Any) -> TranscriptionProviderName | None:
    """Normalise *value* to a known provider name.

    Strings are stripped and lower-cased before being looked up in
    ``_DEFAULT_MODELS``. Non-strings and unknown names yield ``None``.
    """
    if not isinstance(value, str):
        return None
    candidate = value.strip().lower()
    return candidate if candidate in _DEFAULT_MODELS else None  # type: ignore[return-value]
def _provider_config(config: Any, provider: str) -> Any:
return getattr(getattr(config, "providers", None), provider, None)
def _extract_data_url_mime(url: str) -> str | None:
header, _, _ = url.partition(",")
if not header.startswith("data:") or ";base64" not in header:
return None
return header[5:].split(";", 1)[0].strip().lower() or None
def resolve_transcription_config(config: Any) -> EffectiveTranscriptionConfig:
    """Resolve top-level transcription settings with legacy channel fallback.

    Resolution rules, as implemented below:

    * ``provider``: ``config.transcription.provider``, then the legacy
      ``config.channels.transcription_provider``, then ``_DEFAULT_PROVIDER``.
      Unrecognised names are skipped at each step.
    * ``language``: ``config.transcription.language``, then the legacy
      ``config.channels.transcription_language``.
    * ``model``: ``config.transcription.model`` with surrounding whitespace
      removed; a missing, empty, or whitespace-only value resolves to the
      provider's entry in ``_DEFAULT_MODELS``.
    * ``enabled``, ``max_duration_sec``, ``max_upload_mb``: read from
      ``config.transcription`` only, defaulting to ``True``, ``120`` and ``25``.
    * ``api_key`` / ``api_base``: read from ``config.providers.<provider>``;
      missing values become empty strings.

    Args:
        config: Loaded application config. Any of the sections read here may
            be absent.

    Returns:
        The resolved, immutable ``EffectiveTranscriptionConfig``.
    """
    top = getattr(config, "transcription", None)
    channels = getattr(config, "channels", None)
    provider = (
        _as_provider(getattr(top, "provider", None))
        or _as_provider(getattr(channels, "transcription_provider", None))
        or _DEFAULT_PROVIDER
    )
    provider_cfg = _provider_config(config, provider)
    # Strip before falling back: a whitespace-only override is truthy, so the
    # default has to be applied after trimming or the model would resolve to "".
    model = (getattr(top, "model", None) or "").strip() or _DEFAULT_MODELS[provider]
    return EffectiveTranscriptionConfig(
        enabled=bool(getattr(top, "enabled", True)),
        provider=provider,
        model=model,
        language=getattr(top, "language", None) or getattr(channels, "transcription_language", None),
        api_key=getattr(provider_cfg, "api_key", None) or "",
        api_base=getattr(provider_cfg, "api_base", None) or "",
        max_duration_sec=int(getattr(top, "max_duration_sec", 120)),
        max_upload_mb=int(getattr(top, "max_upload_mb", 25)),
    )
async def transcribe_audio_data_url(
    data_url: Any,
    config: EffectiveTranscriptionConfig,
    *,
    duration_ms: Any = None,
) -> str:
    """Transcribe an audio clip supplied as a base64 ``data:`` URL.

    The clip is validated, written to the ``webui-transcription`` media
    directory, transcribed, and the temporary file is removed again whether
    or not transcription succeeds.

    Args:
        data_url: The ``data:`` URL holding the audio.
        config: Resolved transcription settings.
        duration_ms: Optional client-reported clip length in milliseconds.
            Only numeric values are checked.

    Returns:
        The non-empty transcript.

    Raises:
        TranscriptionIngressError: with ``detail`` set to, in checking order,
            ``missing_audio``, ``disabled``, ``not_configured``, ``duration``,
            ``mime``, ``size``, ``decode`` or ``empty``.
    """
    if not (isinstance(data_url, str) and data_url):
        raise TranscriptionIngressError("missing_audio")
    if not config.enabled:
        raise TranscriptionIngressError("disabled")
    if not config.configured:
        raise TranscriptionIngressError("not_configured", provider=config.provider)

    if isinstance(duration_ms, (int, float)):
        # The limit carries 1000 ms of slack on top of the configured maximum.
        limit_ms = config.max_duration_sec * 1000 + 1000
        if duration_ms > limit_ms:
            raise TranscriptionIngressError("duration")

    mime = _extract_data_url_mime(data_url)
    if mime not in _AUDIO_MIME_ALLOWED:
        raise TranscriptionIngressError("mime")

    if config.max_upload_mb:
        byte_limit = config.max_upload_mb * 1024 * 1024
    else:
        byte_limit = _MAX_AUDIO_BYTES_FALLBACK
    byte_limit = max(1, byte_limit)

    saved_path: str | None = None
    try:
        saved_path = save_base64_data_url(
            data_url,
            get_media_dir("webui-transcription"),
            max_bytes=byte_limit,
        )
    except FileSizeExceeded as exc:
        raise TranscriptionIngressError("size") from exc
    except Exception as exc:
        # Any other failure is logged and reported as "decode" just below.
        logger.warning("transcription audio decode failed: {}", exc)
    if not saved_path:
        raise TranscriptionIngressError("decode")

    try:
        transcript = await transcribe_audio_file(saved_path, config)
    finally:
        # Best-effort cleanup of the temporary upload.
        with suppress(OSError):
            Path(saved_path).unlink(missing_ok=True)

    if not transcript:
        raise TranscriptionIngressError("empty")
    return transcript
async def transcribe_audio_file(
    file_path: str | Path,
    config: EffectiveTranscriptionConfig,
) -> str:
    """Send *file_path* to the provider named in the resolved *config*.

    Returns an empty string without contacting any provider when
    transcription is disabled or no API key is configured. ``"openai"``
    selects the OpenAI adapter; any other provider value selects the Groq
    adapter. The adapter module is imported only when it is needed.
    """
    if not (config.enabled and config.configured):
        return ""

    if config.provider == "openai":
        from nanobot.providers.transcription import (
            OpenAITranscriptionProvider as adapter_cls,
        )
    else:
        from nanobot.providers.transcription import (
            GroqTranscriptionProvider as adapter_cls,
        )

    adapter = adapter_cls(
        api_key=config.api_key,
        api_base=config.api_base or None,
        language=config.language,
        model=config.model,
    )
    return await adapter.transcribe(file_path)

View File

@ -28,10 +28,6 @@ class BaseChannel(ABC):
name: str = "base"
display_name: str = "Base"
transcription_provider: str = "groq"
transcription_api_key: str = ""
transcription_api_base: str = ""
transcription_language: str | None = None
send_progress: bool = True
send_tool_hints: bool = False
show_reasoning: bool = True
@ -51,24 +47,14 @@ class BaseChannel(ABC):
async def transcribe_audio(self, file_path: str | Path) -> str:
"""Transcribe an audio file via Whisper (OpenAI or Groq). Returns empty string on failure."""
if not self.transcription_api_key:
return ""
try:
if self.transcription_provider == "openai":
from nanobot.providers.transcription import OpenAITranscriptionProvider
provider = OpenAITranscriptionProvider(
api_key=self.transcription_api_key,
api_base=self.transcription_api_base or None,
language=self.transcription_language or None,
)
else:
from nanobot.providers.transcription import GroqTranscriptionProvider
provider = GroqTranscriptionProvider(
api_key=self.transcription_api_key,
api_base=self.transcription_api_base or None,
language=self.transcription_language or None,
)
return await provider.transcribe(file_path)
from nanobot.audio.transcription import (
resolve_transcription_config,
transcribe_audio_file,
)
from nanobot.config.loader import load_config
return await transcribe_audio_file(file_path, resolve_transcription_config(load_config()))
except Exception:
self.logger.exception("Audio transcription failed")
return ""

View File

@ -80,11 +80,6 @@ class ChannelManager:
"""Initialize channels discovered via pkgutil scan + entry_points plugins."""
from nanobot.channels.registry import discover_channel_names, discover_enabled
transcription_provider = self.config.channels.transcription_provider
transcription_key = self._resolve_transcription_key(transcription_provider)
transcription_base = self._resolve_transcription_base(transcription_provider)
transcription_language = self.config.channels.transcription_language
# Collect enabled module names first, then only import those.
# Channel configs live in ChannelsConfig's extra fields (via
# extra="allow"), so we enumerate candidates from pkgutil scan
@ -135,10 +130,6 @@ class ChannelManager:
)
kwargs["gateway"] = gateway
channel = cls(section, self.bus, **kwargs)
channel.transcription_provider = transcription_provider
channel.transcription_api_key = transcription_key
channel.transcription_api_base = transcription_base
channel.transcription_language = transcription_language
channel.send_progress = self._resolve_bool_override(
section, "send_progress", self.config.channels.send_progress,
)
@ -155,24 +146,6 @@ class ChannelManager:
self._validate_allow_from()
def _resolve_transcription_key(self, provider: str) -> str:
"""Pick the API key for the configured transcription provider."""
try:
if provider == "openai":
return self.config.providers.openai.api_key
return self.config.providers.groq.api_key
except AttributeError:
return ""
def _resolve_transcription_base(self, provider: str) -> str:
"""Pick the API base URL for the configured transcription provider."""
try:
if provider == "openai":
return self.config.providers.openai.api_base or ""
return self.config.providers.groq.api_base or ""
except AttributeError:
return ""
def _validate_allow_from(self) -> None:
for name, ch in self.channels.items():
cfg = ch.config

View File

@ -45,6 +45,7 @@ from nanobot.webui.http_utils import (
query_first as _query_first,
)
from nanobot.webui.mcp_presets_api import normalize_mcp_preset_mentions
from nanobot.webui.transcription_ws import webui_transcription_event
from nanobot.webui.websocket_logging import websockets_server_logger
@ -235,7 +236,7 @@ _VIDEO_MIME_ALLOWED: frozenset[str] = frozenset({
_UPLOAD_MIME_ALLOWED: frozenset[str] = _IMAGE_MIME_ALLOWED | _VIDEO_MIME_ALLOWED
_DATA_URL_MIME_RE = re.compile(r"^data:([^;]+);base64,", re.DOTALL)
_DATA_URL_MIME_RE = re.compile(r"^data:([^;,]+)(?:;[^,]*)*;base64,", re.DOTALL)
def _extract_data_url_mime(url: str) -> str | None:
@ -419,7 +420,6 @@ class WebSocketChannel(BaseChannel):
return None
# -- Server lifecycle and connection ingress ---------------------------
# -- Server lifecycle and connection ingress ---------------------------
async def start(self) -> None:
from nanobot.utils.logging_bridge import redirect_lib_logging
@ -703,6 +703,10 @@ class WebSocketChannel(BaseChannel):
workspace_scope=scope.payload(),
)
return
if t == "transcribe_audio":
event, payload = await webui_transcription_event(envelope)
await self._send_event(connection, event, **payload)
return
if t == "message":
cid = envelope.get("chat_id")
content = envelope.get("content")

View File

@ -39,8 +39,19 @@ class ChannelsConfig(Base):
show_reasoning: bool = True # surface model reasoning when channel implements it
extract_document_text: bool = True # extract text from document attachments before sending to the model
send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included)
transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai"
transcription_language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$") # Optional ISO-639-1 hint for audio transcription
transcription_provider: str = "groq" # Deprecated: use top-level transcription.provider
transcription_language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$") # Deprecated: use top-level transcription.language
class TranscriptionConfig(Base):
    """Audio transcription settings stored under the top-level ``transcription`` key."""

    # Master switch for audio transcription.
    enabled: bool = True
    # Backend name; ``None`` means no explicit choice was made here.
    provider: Literal["groq", "openai"] | None = None
    # Optional model override; ``None`` means no override.
    model: str | None = None
    # Optional language hint: two or three lowercase letters.
    language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")
    # Recording length limit in seconds (1-600).
    max_duration_sec: int = Field(default=120, ge=1, le=600)
    # Upload size limit in megabytes (1-100).
    max_upload_mb: int = Field(default=25, ge=1, le=100)
class DreamConfig(Base):
@ -167,7 +178,7 @@ class AgentsConfig(Base):
class ProviderConfig(Base):
"""LLM provider configuration."""
api_key: str | None = None
api_key: str | None = Field(default=None, repr=False)
api_base: str | None = None
api_type: Literal["auto", "chat_completions", "responses"] = "auto" # Request API surface
extra_headers: dict[str, str] | None = None # Custom headers (e.g. APP-Code for AiHubMix)
@ -312,6 +323,7 @@ class Config(BaseSettings):
agents: AgentsConfig = Field(default_factory=AgentsConfig)
channels: ChannelsConfig = Field(default_factory=ChannelsConfig)
transcription: TranscriptionConfig = Field(default_factory=TranscriptionConfig)
providers: ProvidersConfig = Field(default_factory=ProvidersConfig)
api: ApiConfig = Field(default_factory=ApiConfig)
gateway: GatewayConfig = Field(default_factory=GatewayConfig)

View File

@ -1,6 +1,12 @@
"""Voice transcription providers (Groq and OpenAI Whisper)."""
"""Provider-specific voice transcription adapters.
This module only knows how to call external transcription APIs such as Groq
and OpenAI Whisper. Product-level config fallback, WebUI upload validation,
and channel integration live in ``nanobot.audio.transcription``.
"""
import asyncio
import mimetypes
import os
from pathlib import Path
@ -8,6 +14,15 @@ import httpx
from loguru import logger
_TRANSCRIPTIONS_PATH = "audio/transcriptions"
_AUDIO_MIME_OVERRIDES = {
".m4a": "audio/mp4",
".mpga": "audio/mpeg",
".ogg": "audio/ogg",
".opus": "audio/ogg",
".wav": "audio/wav",
".weba": "audio/webm",
".webm": "audio/webm",
}
def _resolve_transcription_url(api_base: str | None, default_url: str) -> str:
@ -26,6 +41,14 @@ def _resolve_transcription_url(api_base: str | None, default_url: str) -> str:
return f"{base}/{_TRANSCRIPTIONS_PATH}"
def _audio_mime_type(path: Path) -> str:
    """Pick the MIME type to send for the audio file at *path*.

    The lower-cased suffix is looked up in ``_AUDIO_MIME_OVERRIDES`` first,
    then ``mimetypes.guess_type`` is tried on the file name, and finally
    ``application/octet-stream`` is used.
    """
    override = _AUDIO_MIME_OVERRIDES.get(path.suffix.lower())
    if override:
        return override
    guessed, _encoding = mimetypes.guess_type(path.name)
    return guessed or "application/octet-stream"
# Up to 3 retries (4 attempts total) with exponential backoff on transient
# failures. Whisper endpoints occasionally return 502/503 under load, and
# mobile-network transcription callers hit sporadic connect/read errors.
@ -71,7 +94,7 @@ async def _post_transcription_with_retry(
async with httpx.AsyncClient() as client:
for attempt in range(_MAX_RETRIES + 1):
files = {
"file": (path.name, data),
"file": (path.name, data, _audio_mime_type(path)),
"model": (None, model),
}
if language:
@ -113,6 +136,16 @@ async def _post_transcription_with_retry(
try:
response.raise_for_status()
except httpx.HTTPStatusError:
body = response.text.strip().replace("\n", " ")[:500]
logger.error(
"{} transcription HTTP {}{}{}",
provider_label,
response.status_code,
f" {response.reason_phrase}" if response.reason_phrase else "",
f": {body}" if body else "",
)
return ""
except Exception as e:
logger.exception("{} transcription error: {}", provider_label, e)
return ""
@ -144,6 +177,7 @@ class OpenAITranscriptionProvider:
api_key: str | None = None,
api_base: str | None = None,
language: str | None = None,
model: str | None = None,
):
self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
self.api_url = _resolve_transcription_url(
@ -151,6 +185,7 @@ class OpenAITranscriptionProvider:
"https://api.openai.com/v1/audio/transcriptions",
)
self.language = language or None
self.model = model or "whisper-1"
logger.debug("OpenAI transcription endpoint: {}", self.api_url)
async def transcribe(self, file_path: str | Path) -> str:
@ -165,7 +200,7 @@ class OpenAITranscriptionProvider:
self.api_url,
api_key=self.api_key,
path=path,
model="whisper-1",
model=self.model,
provider_label="OpenAI",
language=self.language,
)
@ -183,6 +218,7 @@ class GroqTranscriptionProvider:
api_key: str | None = None,
api_base: str | None = None,
language: str | None = None,
model: str | None = None,
):
self.api_key = api_key or os.environ.get("GROQ_API_KEY")
self.api_url = _resolve_transcription_url(
@ -190,6 +226,7 @@ class GroqTranscriptionProvider:
"https://api.groq.com/openai/v1/audio/transcriptions",
)
self.language = language or None
self.model = model or "whisper-large-v3"
logger.debug("Groq transcription endpoint: {}", self.api_url)
async def transcribe(self, file_path: str | Path) -> str:
@ -215,7 +252,7 @@ class GroqTranscriptionProvider:
self.api_url,
api_key=self.api_key,
path=path,
model="whisper-large-v3",
model=self.model,
provider_label="Groq",
language=self.language,
)

View File

@ -18,13 +18,30 @@ from nanobot.utils.helpers import safe_filename
DEFAULT_MAX_BYTES = 10 * 1024 * 1024
MAX_FILE_SIZE = DEFAULT_MAX_BYTES
_DATA_URL_RE = re.compile(r"^data:([^;]+);base64,(.+)$", re.DOTALL)
_DATA_URL_RE = re.compile(r"^data:([^;,]+)(?:;[^,]*)*;base64,(.+)$", re.DOTALL)
_MIME_EXTENSION_OVERRIDES = {
# Python's ``mimetypes`` maps browser-recorded audio/webm to ``.weba`` and
# audio/ogg to ``.oga`` on macOS. Some transcription APIs validate by the
# file extension and accept the canonical container extensions instead.
"application/ogg": ".ogg",
"audio/ogg": ".ogg",
"audio/mpga": ".mpga",
"audio/wav": ".wav",
"audio/webm": ".webm",
"audio/x-m4a": ".m4a",
"audio/x-wav": ".wav",
"audio/vnd.wave": ".wav",
"video/webm": ".webm",
}
class FileSizeExceeded(Exception):
class FileSizeExceededError(Exception):
"""Raised when a decoded payload exceeds the caller's size limit."""
FileSizeExceeded = FileSizeExceededError
def save_base64_data_url(
data_url: str,
media_dir: Path,
@ -40,7 +57,7 @@ def save_base64_data_url(
m = _DATA_URL_RE.match(data_url)
if not m:
return None
mime_type, b64_payload = m.group(1), m.group(2)
mime_type, b64_payload = m.group(1).strip().lower(), m.group(2)
try:
raw = base64.b64decode(b64_payload)
except Exception:
@ -48,7 +65,7 @@ def save_base64_data_url(
limit = DEFAULT_MAX_BYTES if max_bytes is None else max_bytes
if len(raw) > limit:
raise FileSizeExceeded(f"File exceeds {limit // (1024 * 1024)}MB limit")
ext = mimetypes.guess_extension(mime_type) or ".bin"
ext = _MIME_EXTENSION_OVERRIDES.get(mime_type) or mimetypes.guess_extension(mime_type) or ".bin"
filename = f"{uuid.uuid4().hex[:12]}{ext}"
dest = media_dir / safe_filename(filename)
dest.write_bytes(raw)

View File

@ -15,6 +15,7 @@ from zoneinfo import ZoneInfo
import httpx
from nanobot.audio.transcription import resolve_transcription_config
from nanobot.config.loader import get_config_path, load_config, save_config
from nanobot.config.schema import ModelPresetConfig
from nanobot.providers.image_generation import (
@ -90,6 +91,7 @@ _IMAGE_GENERATION_ASPECT_RATIOS = {
"2:3",
"21:9",
}
_TRANSCRIPTION_PROVIDERS = ("groq", "openai")
_CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144}
_MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+")
_ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
@ -576,6 +578,22 @@ def _image_generation_provider_rows(config: Any) -> list[dict[str, Any]]:
return rows
def _transcription_provider_rows(config: Any) -> list[dict[str, Any]]:
    """Build one settings row per name in ``_TRANSCRIPTION_PROVIDERS``.

    Each row carries the provider name, a display label (the registry label
    when ``find_by_name`` returns a spec, otherwise the name), whether an API
    key is set, a masked key hint, the configured API base, and the spec's
    default API base when it has one.
    """

    def build_row(name: str) -> dict[str, Any]:
        spec = find_by_name(name)
        provider_config = getattr(config.providers, name, None)
        api_key = getattr(provider_config, "api_key", None)
        label = name if spec is None else spec.label
        default_base = spec.default_api_base if spec and spec.default_api_base else None
        return {
            "name": name,
            "label": label,
            "configured": bool(api_key),
            "api_key_hint": _mask_secret_hint(api_key),
            "api_base": getattr(provider_config, "api_base", None),
            "default_api_base": default_base,
        }

    return [build_row(name) for name in _TRANSCRIPTION_PROVIDERS]
def settings_payload(
*,
requires_restart: bool = False,
@ -633,6 +651,7 @@ def settings_payload(
search_config = config.tools.web.search
image_config = config.tools.image_generation
transcription = resolve_transcription_config(config)
search_provider = (
search_config.provider
if search_config.provider in _WEB_SEARCH_PROVIDER_BY_NAME
@ -733,6 +752,16 @@ def settings_payload(
"save_dir": image_config.save_dir,
"providers": image_providers,
},
"transcription": {
"enabled": transcription.enabled,
"provider": transcription.provider,
"provider_configured": transcription.configured,
"model": transcription.model,
"language": transcription.language,
"max_duration_sec": transcription.max_duration_sec,
"max_upload_mb": transcription.max_upload_mb,
"providers": _transcription_provider_rows(config),
},
"runtime": {
"config_path": str(get_config_path().expanduser()),
"workspace_path": str(config.workspace_path),
@ -1311,3 +1340,71 @@ def update_image_generation_settings(query: QueryParams) -> dict[str, Any]:
if changed:
save_config(config)
return settings_payload(requires_restart=changed)
def update_transcription_settings(query: QueryParams) -> dict[str, Any]:
    """Apply transcription settings carried in ``query`` and return the settings payload.

    Handled keys: ``enabled``, ``provider``, ``model``, ``language``,
    ``max_duration_sec`` (alias ``maxDurationSec``) and ``max_upload_mb``
    (alias ``maxUploadMb``). A key whose lookup yields ``None`` is skipped; a
    blank ``model`` or ``language`` is stored as ``None``. The config is saved
    only when at least one stored value actually changed.

    Raises:
        WebUISettingsError: If a supplied value fails validation.
    """
    config = load_config()
    section = config.transcription
    dirty = False

    def assign(field: str, value: Any) -> None:
        # Record a change only when the stored value really differs, so a
        # no-op request never rewrites the config file.
        nonlocal dirty
        if getattr(section, field) != value:
            setattr(section, field, value)
            dirty = True

    if (raw_enabled := _query_first(query, "enabled")) is not None:
        assign("enabled", _parse_bool(raw_enabled, "enabled"))

    if (raw_provider := _query_first(query, "provider")) is not None:
        provider_name = raw_provider.strip().lower()
        if provider_name not in _TRANSCRIPTION_PROVIDERS:
            raise WebUISettingsError("unknown transcription provider")
        assign("provider", provider_name)

    if (raw_model := _query_first(query, "model")) is not None:
        model_name = raw_model.strip() or None
        if model_name is not None and len(model_name) > 200:
            raise WebUISettingsError("transcription model is too long")
        assign("model", model_name)

    if (raw_language := _query_first(query, "language")) is not None:
        language_code = raw_language.strip().lower() or None
        if language_code is not None and not re.fullmatch(r"[a-z]{2,3}", language_code):
            raise WebUISettingsError("transcription language must be 2-3 lowercase letters")
        assign("language", language_code)

    raw_duration = _query_first_alias(query, "max_duration_sec", "maxDurationSec")
    if raw_duration is not None:
        try:
            seconds = int(raw_duration)
        except ValueError:
            raise WebUISettingsError("max_duration_sec must be an integer") from None
        if not 1 <= seconds <= 600:
            raise WebUISettingsError("max_duration_sec must be between 1 and 600")
        assign("max_duration_sec", seconds)

    raw_upload = _query_first_alias(query, "max_upload_mb", "maxUploadMb")
    if raw_upload is not None:
        try:
            megabytes = int(raw_upload)
        except ValueError:
            raise WebUISettingsError("max_upload_mb must be an integer") from None
        if not 1 <= megabytes <= 100:
            raise WebUISettingsError("max_upload_mb must be between 1 and 100")
        assign("max_upload_mb", megabytes)

    if dirty:
        save_config(config)
    return settings_payload()

View File

@ -33,6 +33,7 @@ from nanobot.webui.settings_api import (
update_model_configuration,
update_network_safety_settings,
update_provider_settings,
update_transcription_settings,
update_web_search_settings,
)
@ -100,6 +101,8 @@ class WebUISettingsRouter:
return self._handle_settings_web_search_update(request)
if path == "/api/settings/image-generation/update":
return self._handle_settings_image_generation_update(request)
if path == "/api/settings/transcription/update":
return self._handle_settings_transcription_update(request)
if path == "/api/settings/network-safety/update":
return self._handle_settings_network_safety_update(request)
if path == "/api/settings/cli-apps":
@ -275,6 +278,15 @@ class WebUISettingsRouter:
return self._error_response(e.status, e.message)
return self._json_response(self._with_restart_state(payload, section="image"))
def _handle_settings_transcription_update(self, request: WsRequest) -> Response:
    """Serve ``/api/settings/transcription/update``.

    Rejects unauthorized requests, applies the query parameters through
    ``update_transcription_settings``, and maps a ``WebUISettingsError`` to an
    error response built from its status and message.
    """
    if not self._authorized(request):
        return self._unauthorized()
    try:
        updated = update_transcription_settings(self._query(request))
    except WebUISettingsError as exc:
        return self._error_response(exc.status, exc.message)
    body = self._with_restart_state(updated)
    return self._json_response(body)
def _handle_settings_network_safety_update(self, request: WsRequest) -> Response:
if not self._authorized(request):
return self._unauthorized()

View File

@ -0,0 +1,46 @@
"""WebUI transcription envelope handling.
The WebSocket channel owns transport and subscription fan-out. This module owns
the WebUI-specific audio transcription action carried over that socket.
"""
from __future__ import annotations
from typing import Any
from nanobot.audio.transcription import (
TranscriptionIngressError,
resolve_transcription_config,
transcribe_audio_data_url,
)
from nanobot.config.loader import load_config
_MAX_REQUEST_ID_LENGTH = 80
async def webui_transcription_event(envelope: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Return the WS event name and payload for one WebUI transcription request.

    The envelope must carry a non-empty string ``request_id`` of at most
    ``_MAX_REQUEST_ID_LENGTH`` characters. Its ``data_url`` and ``duration_ms``
    are forwarded to ``transcribe_audio_data_url`` together with the
    transcription config resolved from a freshly loaded config.

    Returns:
        ``("transcription_result", {...})`` with the request id and text on
        success, or ``("transcription_error", {...})`` with a ``detail`` code
        (plus any extra fields from the raised ``TranscriptionIngressError``).
    """
    request_id = envelope.get("request_id")
    has_usable_id = (
        isinstance(request_id, str)
        and 0 < len(request_id) <= _MAX_REQUEST_ID_LENGTH
    )

    def failure(detail: str, **extra: Any) -> tuple[str, dict[str, Any]]:
        body: dict[str, Any] = {"detail": detail, **extra}
        # Only a validated id is echoed back to the client.
        if has_usable_id:
            body["request_id"] = request_id
        return "transcription_error", body

    if not has_usable_id:
        return failure("invalid_request")

    try:
        transcript = await transcribe_audio_data_url(
            envelope.get("data_url"),
            resolve_transcription_config(load_config()),
            duration_ms=envelope.get("duration_ms"),
        )
    except TranscriptionIngressError as exc:
        return failure(exc.detail, **exc.extra)

    return "transcription_result", {"request_id": request_id, "text": transcript}

View File

@ -12,7 +12,8 @@ from nanobot.bus.events import OutboundMessage
from nanobot.bus.queue import MessageBus
from nanobot.channels.base import BaseChannel
from nanobot.channels.manager import ChannelManager
from nanobot.config.schema import ChannelsConfig
from nanobot.config.loader import save_config
from nanobot.config.schema import ChannelsConfig, Config
from nanobot.providers.transcription import GroqTranscriptionProvider as _GroqProvider
from nanobot.providers.transcription import OpenAITranscriptionProvider as _OpenAIProvider
from nanobot.utils.restart import RestartNotice
@ -238,102 +239,103 @@ async def test_manager_loads_plugin_from_dict_config():
@pytest.mark.asyncio
async def test_manager_propagates_groq_transcription_api_base_to_channels():
from nanobot.channels.manager import ChannelManager
fake_config = SimpleNamespace(
channels=ChannelsConfig.model_validate({
"fakeplugin": {"enabled": True, "allowFrom": ["*"]},
"transcriptionLanguage": "en",
}),
providers=SimpleNamespace(
groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"),
openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"),
),
)
with patch(
"nanobot.channels.registry.discover_enabled",
return_value={"fakeplugin": _FakePlugin},
):
mgr = ChannelManager.__new__(ChannelManager)
mgr.config = fake_config
mgr.bus = MessageBus()
mgr.channels = {}
mgr._dispatch_task = None
mgr._init_channels()
channel = mgr.channels["fakeplugin"]
assert channel.transcription_provider == "groq"
assert channel.transcription_api_key == "groq-key"
assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
assert channel.transcription_language == "en"
@pytest.mark.asyncio
async def test_manager_propagates_openai_transcription_api_base_to_channels():
from nanobot.channels.manager import ChannelManager
fake_config = SimpleNamespace(
channels=ChannelsConfig.model_validate({
"fakeplugin": {"enabled": True, "allowFrom": ["*"]},
"transcriptionProvider": "openai",
}),
providers=SimpleNamespace(
openai=SimpleNamespace(
api_key="openai-key",
api_base="http://proxy.local/v1/audio/transcriptions",
),
groq=SimpleNamespace(api_key="groq-key", api_base=""),
),
)
with patch(
"nanobot.channels.registry.discover_enabled",
return_value={"fakeplugin": _FakePlugin},
):
mgr = ChannelManager.__new__(ChannelManager)
mgr.config = fake_config
mgr.bus = MessageBus()
mgr.channels = {}
mgr._dispatch_task = None
mgr._init_channels()
channel = mgr.channels["fakeplugin"]
assert channel.transcription_provider == "openai"
assert channel.transcription_api_key == "openai-key"
assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
@pytest.mark.asyncio
async def test_base_channel_passes_api_base_to_openai_transcription_provider():
"""BaseChannel.transcribe_audio must forward transcription_api_base to OpenAI."""
async def test_base_channel_reads_current_transcription_config_each_call(
tmp_path,
monkeypatch: pytest.MonkeyPatch,
):
"""BaseChannel.transcribe_audio resolves config at call time, not manager init time."""
from nanobot.providers import transcription as transcription_mod
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
channel.transcription_provider = "openai"
channel.transcription_api_key = "k"
channel.transcription_api_base = "http://override/v1/audio/transcriptions"
channel.transcription_language = "en"
config_path = tmp_path / "config.json"
config = Config()
config.transcription.provider = "openai"
config.transcription.model = "whisper-custom"
config.transcription.language = "en"
config.providers.openai.api_key = "openai-key"
config.providers.openai.api_base = "http://openai.local/v1/audio/transcriptions"
save_config(config, config_path)
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
captured: dict[str, object] = {}
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
calls: list[dict[str, object]] = []
class _StubOpenAI:
def __init__(self, api_key=None, api_base=None, language=None):
captured["api_key"] = api_key
captured["api_base"] = api_base
captured["language"] = language
def __init__(self, api_key=None, api_base=None, language=None, model=None):
calls.append({
"provider": "openai",
"api_key": api_key,
"api_base": api_base,
"language": language,
"model": model,
})
async def transcribe(self, file_path):
return "ok"
return "openai-ok"
with patch.object(transcription_mod, "OpenAITranscriptionProvider", _StubOpenAI):
result = await channel.transcribe_audio("/tmp/does-not-matter.wav")
class _StubGroq:
def __init__(self, api_key=None, api_base=None, language=None, model=None):
calls.append({
"provider": "groq",
"api_key": api_key,
"api_base": api_base,
"language": language,
"model": model,
})
assert result == "ok"
assert captured["api_key"] == "k"
assert captured["api_base"] == "http://override/v1/audio/transcriptions"
assert captured["language"] == "en"
async def transcribe(self, file_path):
return "groq-ok"
with (
patch.object(transcription_mod, "OpenAITranscriptionProvider", _StubOpenAI),
patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq),
):
assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "openai-ok"
config.transcription.provider = "groq"
config.transcription.model = "whisper-large-v3-turbo"
config.transcription.language = "ko"
config.providers.groq.api_key = "groq-key"
config.providers.groq.api_base = "http://groq.local/v1/audio/transcriptions"
save_config(config, config_path)
assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "groq-ok"
assert calls == [
{
"provider": "openai",
"api_key": "openai-key",
"api_base": "http://openai.local/v1/audio/transcriptions",
"language": "en",
"model": "whisper-custom",
},
{
"provider": "groq",
"api_key": "groq-key",
"api_base": "http://groq.local/v1/audio/transcriptions",
"language": "ko",
"model": "whisper-large-v3-turbo",
},
]
# With transcription disabled in the saved config, transcribe_audio must return
# an empty string and must never construct the Groq provider, even though a
# Groq API key is configured.
@pytest.mark.asyncio
async def test_base_channel_respects_disabled_transcription_config(
tmp_path,
monkeypatch: pytest.MonkeyPatch,
):
config_path = tmp_path / "config.json"
config = Config()
config.transcription.enabled = False
config.providers.groq.api_key = "groq-key"
save_config(config, config_path)
# Point the config loader at the temporary file written above.
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
with patch("nanobot.providers.transcription.GroqTranscriptionProvider") as provider:
assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == ""
provider.assert_not_called()
def test_openai_transcription_provider_honors_api_base_argument():
@ -348,37 +350,6 @@ def test_openai_transcription_provider_honors_api_base_argument():
assert custom.api_url == "http://override/v1/audio/transcriptions"
@pytest.mark.asyncio
async def test_base_channel_passes_language_to_groq_transcription_provider():
"""BaseChannel.transcribe_audio must forward transcription_language to Groq."""
from nanobot.providers import transcription as transcription_mod
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
channel.transcription_provider = "groq"
channel.transcription_api_key = "k"
channel.transcription_api_base = "http://override/v1/audio/transcriptions"
channel.transcription_language = "ko"
captured: dict[str, object] = {}
class _StubGroq:
def __init__(self, api_key=None, api_base=None, language=None):
captured["api_key"] = api_key
captured["api_base"] = api_base
captured["language"] = language
async def transcribe(self, file_path):
return "ok"
with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq):
result = await channel.transcribe_audio("/tmp/does-not-matter.wav")
assert result == "ok"
assert captured["api_key"] == "k"
assert captured["api_base"] == "http://override/v1/audio/transcriptions"
assert captured["language"] == "ko"
# ---------------------------------------------------------------------------
# Transcription provider HTTP tests
# ---------------------------------------------------------------------------

View File

@ -69,6 +69,7 @@ def _make_channel() -> WebSocketChannel:
[
("data:image/png;base64,AAAA", "image/png"),
("data:image/jpeg;base64,AAAA", "image/jpeg"),
("data:audio/webm;codecs=opus;base64,AAAA", "audio/webm"),
("data:IMAGE/PNG;base64,AAAA", "image/png"),
("data:image/svg+xml;base64,AAAA", "image/svg+xml"),
("data:text/plain;base64,AAAA", "text/plain"),

View File

@ -271,8 +271,6 @@ async def test_lid_to_phone_cache_resolves_lid_only_messages():
async def test_voice_message_transcription_uses_media_path():
"""Voice messages are transcribed when media path is available."""
ch = WhatsAppChannel({"enabled": True, "allowFrom": ["*"]}, MagicMock())
ch.transcription_provider = "openai"
ch.transcription_api_key = "sk-test"
ch._handle_message = AsyncMock()
ch.transcribe_audio = AsyncMock(return_value="Hello world")

View File

@ -8,6 +8,8 @@ from unittest.mock import AsyncMock, patch
import httpx
import pytest
from nanobot.audio.transcription import resolve_transcription_config
from nanobot.config.schema import Config
from nanobot.providers.transcription import (
GroqTranscriptionProvider,
OpenAITranscriptionProvider,
@ -33,6 +35,65 @@ def _raw_response(status: int, content: bytes) -> httpx.Response:
return httpx.Response(status_code=status, content=content, request=request)
# When only the legacy channels.transcription_* fields are set, the resolver
# must report that provider and language, pick up the matching provider's API
# key and base, and report "whisper-1" as the model.
def test_resolver_uses_legacy_channel_provider_when_top_level_is_unset() -> None:
config = Config()
config.channels.transcription_provider = "openai"
config.channels.transcription_language = "en"
config.providers.openai.api_key = "sk-test"
config.providers.openai.api_base = "https://proxy.example/v1"
resolved = resolve_transcription_config(config)
assert resolved.provider == "openai"
assert resolved.model == "whisper-1"
assert resolved.language == "en"
assert resolved.api_key == "sk-test"
assert resolved.api_base == "https://proxy.example/v1"
assert resolved.configured is True
# When both the legacy channel fields and the top-level transcription section
# are set, the resolved provider, model, language and credentials must come
# from the top-level section.
def test_resolver_prefers_top_level_transcription_over_legacy_channels() -> None:
config = Config()
config.channels.transcription_provider = "openai"
config.channels.transcription_language = "en"
config.transcription.provider = "groq"
config.transcription.model = "whisper-large-v3-turbo"
config.transcription.language = "ko"
config.providers.groq.api_key = "gsk-test"
config.providers.groq.api_base = "https://groq.example/openai/v1"
resolved = resolve_transcription_config(config)
assert resolved.provider == "groq"
assert resolved.model == "whisper-large-v3-turbo"
assert resolved.language == "ko"
assert resolved.api_key == "gsk-test"
assert resolved.api_base == "https://groq.example/openai/v1"
# repr() of the resolved config must expose neither the secret value nor the
# "api_key" field name.
def test_resolved_transcription_repr_hides_api_key() -> None:
config = Config()
config.providers.groq.api_key = "gsk-secret"
resolved = resolve_transcription_config(config)
assert "gsk-secret" not in repr(resolved)
assert "api_key" not in repr(resolved)
# The enabled flag and the duration/upload limits set on config.transcription
# must be carried over unchanged onto the resolved config.
def test_resolver_keeps_enabled_and_limits_on_effective_config() -> None:
config = Config()
config.transcription.enabled = False
config.transcription.max_duration_sec = 45
config.transcription.max_upload_mb = 12
resolved = resolve_transcription_config(config)
assert resolved.enabled is False
assert resolved.max_duration_sec == 45
assert resolved.max_upload_mb == 12
# ---------------------------------------------------------------------------
# OpenAI provider — retry on transient HTTP + network errors
# ---------------------------------------------------------------------------
@ -215,6 +276,32 @@ async def test_provider_omits_language_when_unset(
assert "language" not in files
# A model passed to the provider constructor must be sent as the "model" part
# of the multipart request.
@pytest.mark.asyncio
async def test_provider_forwards_custom_model_in_multipart(audio_file: Path) -> None:
provider = GroqTranscriptionProvider(api_key="k", model="whisper-large-v3-turbo")
post = AsyncMock(return_value=_response(200, {"text": "ok"}))
# asyncio.sleep is stubbed so any retry back-off cannot slow the test down.
with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()):
result = await provider.transcribe(audio_file)
assert result == "ok"
files = post.await_args_list[0].kwargs["files"]
assert files["model"] == (None, "whisper-large-v3-turbo")
# The uploaded "file" part must carry the file name, the raw bytes, and a MIME
# type matching the file ("audio/webm" for a .webm file).
@pytest.mark.asyncio
async def test_provider_forwards_file_mime_type(tmp_path: Path) -> None:
audio = tmp_path / "voice.webm"
audio.write_bytes(b"audio")
provider = GroqTranscriptionProvider(api_key="k")
post = AsyncMock(return_value=_response(200, {"text": "ok"}))
with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()):
result = await provider.transcribe(audio)
assert result == "ok"
files = post.await_args_list[0].kwargs["files"]
assert files["file"] == ("voice.webm", b"audio", "audio/webm")
@pytest.mark.asyncio
async def test_language_survives_retry(audio_file: Path) -> None:
"""Regression: language must be present on every retry attempt, not just the first."""

View File

@ -6,8 +6,12 @@ import shlex
import subprocess
import sys
from nanobot.agent.tools.exec_session import (
ExecSessionManager,
ListExecSessionsTool,
WriteStdinTool,
)
from nanobot.agent.tools.shell import ExecTool
from nanobot.agent.tools.exec_session import ExecSessionManager, ListExecSessionsTool, WriteStdinTool
def _python_command(code: str) -> str:
@ -141,7 +145,7 @@ def test_exec_can_continue_with_stdin(tmp_path):
return initial, result
initial, result = asyncio.run(run())
assert "ready" in initial
assert "ready" in initial + result
assert "Process running" in initial
assert "Elapsed:" in initial
assert "got:ping" in result
@ -170,7 +174,7 @@ def test_write_stdin_can_close_stdin(tmp_path):
return initial, result
initial, result = asyncio.run(run())
assert "ready" in initial
assert "ready" in initial + result
assert "got:payload" in result
assert "Stdin closed." in result
assert "Exit code: 0" in result
@ -185,14 +189,20 @@ def test_write_stdin_can_terminate_session(tmp_path):
"import time; print('ready', flush=True); time.sleep(30)"
)
initial = await exec_tool.execute(command=command, yield_time_ms=500)
initial = await exec_tool.execute(command=command, yield_time_ms=100)
sid = _session_id(initial)
waited = await stdin_tool.execute(
session_id=sid,
wait_for="ready",
wait_timeout_ms=3000,
yield_time_ms=0,
)
result = await stdin_tool.execute(
session_id=sid,
terminate=True,
yield_time_ms=0,
)
return initial, result
return initial + waited, result
initial, result = asyncio.run(run())
assert "ready" in initial
@ -243,7 +253,7 @@ def test_write_stdin_preserves_completed_session_output_until_polled(tmp_path):
initial, final = asyncio.run(run())
assert "ready" in initial
assert "ready" in initial + final
assert "done" in final
assert "Exit code: 0" in final

View File

@ -8,8 +8,8 @@ import pytest
from nanobot.utils.media_decode import (
DEFAULT_MAX_BYTES,
FileSizeExceeded,
MAX_FILE_SIZE,
FileSizeExceeded,
save_base64_data_url,
)
@ -25,6 +25,31 @@ def test_saves_png_with_correct_extension(tmp_path) -> None:
assert (tmp_path / result.split("/")[-1]).read_bytes() == b"fake png"
# A data URL whose MIME type carries parameters (";codecs=opus") must still be
# decoded and saved, using the .webm extension of the base type.
def test_saves_data_url_with_mime_parameters(tmp_path) -> None:
result = save_base64_data_url(_data_url(b"voice", mime="audio/webm;codecs=opus"), tmp_path)
assert result is not None
assert result.endswith(".webm")
assert (tmp_path / result.split("/")[-1]).read_bytes() == b"voice"
# Each listed audio/video MIME type must be saved with the paired file suffix.
@pytest.mark.parametrize(
("mime", "suffix"),
[
("audio/webm", ".webm"),
("video/webm", ".webm"),
("audio/ogg", ".ogg"),
("audio/wav", ".wav"),
("audio/mpga", ".mpga"),
],
)
def test_saves_common_audio_with_api_friendly_extension(
tmp_path, mime: str, suffix: str
) -> None:
result = save_base64_data_url(_data_url(b"voice", mime=mime), tmp_path)
assert result is not None
assert result.endswith(suffix)
def test_returns_none_for_malformed_data_url(tmp_path) -> None:
assert save_base64_data_url("not-a-data-url", tmp_path) is None

View File

@ -18,6 +18,7 @@ from nanobot.webui.settings_api import (
update_agent_settings,
update_model_configuration,
update_network_safety_settings,
update_transcription_settings,
)
@ -243,6 +244,75 @@ def test_settings_payload_includes_network_safety_fields(
assert payload["advanced"]["ssrf_whitelist_count"] == 1
# settings_payload() must expose a "transcription" section reflecting the
# effective config; here only the legacy channel fields and an OpenAI key are
# set in the saved config.
def test_settings_payload_includes_effective_transcription_config(
tmp_path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
config_path = tmp_path / "config.json"
config = Config()
config.channels.transcription_provider = "openai"
config.channels.transcription_language = "en"
config.providers.openai.api_key = "sk-test"
save_config(config, config_path)
# Point the config loader at the temporary file written above.
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
payload = settings_payload()
assert payload["transcription"]["enabled"] is True
assert payload["transcription"]["provider"] == "openai"
assert payload["transcription"]["provider_configured"] is True
assert payload["transcription"]["model"] == "whisper-1"
assert payload["transcription"]["language"] == "en"
# Updating transcription settings must write the top-level transcription
# section and leave the legacy channels.transcription_* values untouched. The
# camelCase aliases (maxDurationSec, maxUploadMb) are used for the limits.
def test_update_transcription_settings_writes_top_level_only(
tmp_path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
config_path = tmp_path / "config.json"
config = Config()
config.channels.transcription_provider = "openai"
config.channels.transcription_language = "en"
config.providers.groq.api_key = "gsk-test"
save_config(config, config_path)
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
payload = update_transcription_settings(
{
"enabled": ["true"],
"provider": ["groq"],
"model": ["whisper-large-v3-turbo"],
"language": ["ko"],
"maxDurationSec": ["90"],
"maxUploadMb": ["20"],
}
)
# Reload from disk to check what was actually persisted.
saved = load_config(config_path)
assert saved.channels.transcription_provider == "openai"
assert saved.channels.transcription_language == "en"
assert saved.transcription.enabled is True
assert saved.transcription.provider == "groq"
assert saved.transcription.model == "whisper-large-v3-turbo"
assert saved.transcription.language == "ko"
assert saved.transcription.max_duration_sec == 90
assert saved.transcription.max_upload_mb == 20
assert payload["transcription"]["provider"] == "groq"
assert payload["transcription"]["provider_configured"] is True
# A language value that is not 2-3 lowercase letters ("en-US") must be rejected
# with a WebUISettingsError mentioning "transcription language".
def test_update_transcription_settings_validates_language(
tmp_path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
config_path = tmp_path / "config.json"
save_config(Config(), config_path)
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
with pytest.raises(WebUISettingsError, match="transcription language"):
update_transcription_settings({"language": ["en-US"]})
def test_settings_payload_includes_token_usage_summary(
tmp_path,
monkeypatch: pytest.MonkeyPatch,

View File

@ -0,0 +1,129 @@
"""Tests for WebUI transcription envelopes carried over the gateway socket."""
from __future__ import annotations
import base64
from pathlib import Path
from typing import Any
import pytest
from nanobot.config.loader import save_config
from nanobot.config.schema import Config
from nanobot.webui.transcription_ws import webui_transcription_event
def _audio_data_url(payload: bytes = b"voice", mime: str = "audio/webm") -> str:
return f"data:{mime};base64,{base64.b64encode(payload).decode('ascii')}"
# With the Groq provider selected but no API key saved, the request must yield
# a "transcription_error" event whose payload is exactly the request id, the
# "not_configured" detail and the provider name.
@pytest.mark.asyncio
async def test_webui_transcribe_audio_rejects_unconfigured_provider(
tmp_path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
config_path = tmp_path / "config.json"
config = Config()
config.transcription.provider = "groq"
save_config(config, config_path)
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
event, payload = await webui_transcription_event({
"request_id": "voice-1",
"data_url": _audio_data_url(),
})
assert event == "transcription_error"
assert payload == {
"request_id": "voice-1",
"detail": "not_configured",
"provider": "groq",
}
# A data URL with a non-audio MIME type ("text/plain") must be rejected with a
# "transcription_error" event carrying detail "mime".
@pytest.mark.asyncio
async def test_webui_transcribe_audio_rejects_unsupported_mime(
tmp_path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
config_path = tmp_path / "config.json"
config = Config()
config.transcription.provider = "groq"
config.providers.groq.api_key = "gsk-test"
save_config(config, config_path)
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
event, payload = await webui_transcription_event({
"request_id": "voice-1",
"data_url": _audio_data_url(mime="text/plain"),
})
assert event == "transcription_error"
assert payload["request_id"] == "voice-1"
assert payload["detail"] == "mime"
# With max_upload_mb set to 1, a payload one byte over 1 MiB must be rejected
# with a "transcription_error" event carrying detail "size".
@pytest.mark.asyncio
async def test_webui_transcribe_audio_rejects_oversized_audio(
tmp_path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
config_path = tmp_path / "config.json"
config = Config()
config.transcription.provider = "groq"
config.transcription.max_upload_mb = 1
config.providers.groq.api_key = "gsk-test"
save_config(config, config_path)
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
# Redirect the media directory lookup to the temporary directory.
monkeypatch.setattr("nanobot.audio.transcription.get_media_dir", lambda _channel=None: tmp_path)
event, payload = await webui_transcription_event({
"request_id": "voice-1",
"data_url": _audio_data_url(payload=b"x" * (1024 * 1024 + 1)),
})
assert event == "transcription_error"
assert payload["request_id"] == "voice-1"
assert payload["detail"] == "size"
# Happy path: a valid audio data URL must produce a "transcription_result"
# event with the transcribed text, and the file handed to the transcriber must
# exist during the call and be gone afterwards.
@pytest.mark.asyncio
async def test_webui_transcribe_audio_returns_text_and_removes_temp_file(
tmp_path,
monkeypatch: pytest.MonkeyPatch,
) -> None:
config_path = tmp_path / "config.json"
media_dir = tmp_path / "media"
media_dir.mkdir()
config = Config()
config.transcription.provider = "groq"
config.providers.groq.api_key = "gsk-test"
save_config(config, config_path)
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
monkeypatch.setattr(
"nanobot.audio.transcription.get_media_dir",
lambda _channel=None: media_dir,
)
captured_paths: list[Path] = []
# Stand-in for the real transcriber: records the path it was given and checks
# that the file exists at call time.
async def fake_transcribe_audio_file(path: str | Path, _resolved: Any) -> str:
p = Path(path)
assert p.exists()
captured_paths.append(p)
return "hello voice"
monkeypatch.setattr(
"nanobot.audio.transcription.transcribe_audio_file",
fake_transcribe_audio_file,
)
event, payload = await webui_transcription_event({
"request_id": "voice-1",
"data_url": _audio_data_url(payload=b"webm voice", mime="audio/webm;codecs=opus"),
"duration_ms": 1200,
})
assert event == "transcription_result"
assert payload == {"request_id": "voice-1", "text": "hello voice"}
assert captured_paths
# The file seen by the transcriber must no longer exist once the call returns.
assert not captured_paths[0].exists()

View File

@ -81,6 +81,7 @@ const SETTINGS_SECTION_KEYS: SettingsSectionKey[] = [
"appearance",
"models",
"image",
"voice",
"browser",
"apps",
"skills",

View File

@ -1,8 +1,9 @@
import { Suspense, lazy, useCallback, useState } from "react";
import { Suspense, lazy, useCallback, useState, type ReactNode } from "react";
import { Check, Copy } from "lucide-react";
import { useTranslation } from "react-i18next";
import { useThemeValue } from "@/hooks/useTheme";
import { hasAnsi, parseAnsiSegments, stripAnsi } from "@/lib/ansi";
import { cn } from "@/lib/utils";
interface CodeBlockProps {
@ -36,6 +37,10 @@ const CODE_FONT_STACK = [
"monospace",
].join(", ");
const ANSI_LANGUAGES = new Set(["ansi", "ansi-output"]);
const CODE_SURFACE_LIGHT = "#f4f4f5";
const CODE_SURFACE_DARK = "#27272a";
const LazyHighlightedCode = lazy(async () => {
const [
{ default: SyntaxHighlighter },
@ -74,7 +79,11 @@ const LazyHighlightedCode = lazy(async () => {
language={language || "text"}
style={transparentTheme}
customStyle={{
background: chrome === "none" ? "transparent" : undefined,
background: chrome === "none"
? "transparent"
: isDark
? CODE_SURFACE_DARK
: CODE_SURFACE_LIGHT,
margin: 0,
padding: chrome === "none" ? "0.75rem 1rem" : "1rem",
fontFamily: CODE_FONT_STACK,
@ -83,10 +92,10 @@ const LazyHighlightedCode = lazy(async () => {
tabSize: 2,
}}
codeTagProps={{
style: chrome === "none" ? {
style: {
background: "transparent",
fontFamily: CODE_FONT_STACK,
} : undefined,
},
}}
lineNumberStyle={{
minWidth: "2.6em",
@ -106,14 +115,32 @@ const LazyHighlightedCode = lazy(async () => {
};
});
function PlainCodeFallback({
/** Default `renderText` for `CodeTextBlock`: returns the text unchanged. */
function renderPlainText(value: string): ReactNode {
return value;
}
/**
 * Renders text containing ANSI sequences as one `<span>` per segment returned
 * by `parseAnsiSegments`, applying each segment's style.
 */
function renderAnsiText(value: string): ReactNode {
  const segments = parseAnsiSegments(value);
  // Segments are derived purely from `value`, so the position is used as key.
  return segments.map((part, position) => (
    <span key={position} style={part.style}>
      {part.text}
    </span>
  ));
}
function CodeTextBlock({
code,
chrome,
showLineNumbers,
testId,
className,
renderText = renderPlainText,
}: {
code: string;
chrome: "default" | "none";
showLineNumbers: boolean;
testId: string;
className?: string;
renderText?: (value: string) => ReactNode;
}) {
const lines = code.split("\n");
return (
@ -121,10 +148,11 @@ function PlainCodeFallback({
className={cn(
"m-0 overflow-x-auto p-4 font-mono text-sm leading-[1.6] text-foreground/90",
showLineNumbers ? "whitespace-pre" : "whitespace-pre-wrap",
chrome === "default" ? "bg-background" : "bg-transparent",
chrome === "default" ? "bg-zinc-100 dark:bg-zinc-800" : "bg-transparent",
chrome === "none" && "p-3 text-[13px] leading-[1.55]",
className,
)}
data-testid="plain-code-fallback"
data-testid={testId}
>
<code className="text-inherit">
{showLineNumbers ? (
@ -133,16 +161,21 @@ function PlainCodeFallback({
<span className="w-10 shrink-0 select-none pr-4 text-right text-muted-foreground/60">
{index + 1}
</span>
<span className="whitespace-pre">{line || " "}</span>
<span className="whitespace-pre">{renderText(line || " ")}</span>
{index < lines.length - 1 ? "\n" : null}
</span>
))
) : code}
) : renderText(code)}
</code>
</pre>
);
}
/**
 * Decides whether a code block is rendered through the ANSI path: either its
 * language tag (trimmed, case-insensitive) is listed in `ANSI_LANGUAGES`, or
 * `hasAnsi` reports ANSI sequences in the code itself.
 */
function shouldRenderAnsi(language: string | undefined, code: string): boolean {
  const languageKey = language?.trim().toLowerCase();
  if (languageKey && ANSI_LANGUAGES.has(languageKey)) {
    return true;
  }
  return Boolean(hasAnsi(code));
}
export function CodeBlock({
language,
code,
@ -156,19 +189,20 @@ export function CodeBlock({
const [copied, setCopied] = useState(false);
const isDark = useThemeValue() === "dark";
const hasChrome = chrome === "default";
const renderAnsi = shouldRenderAnsi(language, code);
const onCopy = useCallback(() => {
if (!navigator.clipboard) return;
navigator.clipboard.writeText(code).then(() => {
navigator.clipboard.writeText(renderAnsi ? stripAnsi(code) : code).then(() => {
setCopied(true);
setTimeout(() => setCopied(false), 1_500);
});
}, [code]);
}, [code, renderAnsi]);
return (
<div
className={cn(
"overflow-hidden",
"not-prose overflow-hidden",
hasChrome && "rounded-lg border",
hasChrome && (isDark ? "border-white/10" : "border-black/10"),
className,
@ -177,7 +211,7 @@ export function CodeBlock({
{hasChrome ? (
<div
className={cn(
"flex items-center justify-between px-4 py-1.5 text-xs font-medium",
"flex items-center justify-between px-4 pb-1.5 pt-2 text-xs font-medium",
isDark
? "bg-zinc-800 text-zinc-300"
: "bg-zinc-100 text-zinc-600",
@ -206,13 +240,22 @@ export function CodeBlock({
</button>
</div>
) : null}
{highlight ? (
{renderAnsi ? (
<CodeTextBlock
code={code}
chrome={chrome}
showLineNumbers={showLineNumbers}
testId="ansi-code"
renderText={renderAnsiText}
/>
) : highlight ? (
<Suspense
fallback={
<PlainCodeFallback
<CodeTextBlock
code={code}
chrome={chrome}
showLineNumbers={showLineNumbers}
testId="plain-code-fallback"
/>
}
>
@ -226,10 +269,11 @@ export function CodeBlock({
/>
</Suspense>
) : (
<PlainCodeFallback
<CodeTextBlock
code={code}
chrome={chrome}
showLineNumbers={showLineNumbers}
testId="plain-code-fallback"
/>
)}
</div>

View File

@ -31,6 +31,7 @@ import {
Layers,
Loader2,
LogOut,
Mic,
Moon,
PlayCircle,
Plus,
@ -92,6 +93,7 @@ import {
updateNetworkSafetySettings,
updateProviderSettings,
updateSettings,
updateTranscriptionSettings,
updateWebSearchSettings,
} from "@/lib/api";
import { notifyCliAppsChanged } from "@/lib/cli-app-events";
@ -115,6 +117,7 @@ import type {
ProviderModelsPayload,
SettingsPayload,
SkillSummary,
TranscriptionSettingsUpdate,
WebSearchSettingsUpdate,
WebuiDefaultAccessMode,
} from "@/lib/types";
@ -124,6 +127,7 @@ export type SettingsSectionKey =
| "appearance"
| "models"
| "image"
| "voice"
| "browser"
| "apps"
| "skills"
@ -367,6 +371,26 @@ const DEFAULT_IMAGE_GENERATION_FORM: ImageGenerationSettingsUpdate = {
maxImagesPerTurn: 4,
};
// Initial transcription form state used when SettingsView has no initial
// settings payload to derive the form from.
// NOTE(review): `model` is empty here while DEFAULT_TRANSCRIPTION_SETTINGS
// uses "whisper-large-v3" — confirm the difference is intentional.
const DEFAULT_TRANSCRIPTION_FORM: TranscriptionSettingsUpdate = {
enabled: true,
provider: "groq",
model: "",
language: "",
maxDurationSec: 120,
maxUploadMb: 25,
};
// Fallback transcription settings substituted wherever a settings payload
// has no `transcription` field (`payload.transcription ?? ...`).
const DEFAULT_TRANSCRIPTION_SETTINGS: NonNullable<SettingsPayload["transcription"]> = {
enabled: true,
provider: "groq",
provider_configured: false,
model: "whisper-large-v3",
// null is mapped to an empty string when the form is built from a payload.
language: null,
max_duration_sec: 120,
max_upload_mb: 25,
providers: [],
};
const DEFAULT_NETWORK_SAFETY_FORM: NetworkSafetySettingsUpdate = {
webuiAllowLocalServiceAccess: true,
webuiDefaultAccessMode: "default",
@ -419,6 +443,18 @@ function imageGenerationFormFromPayload(payload: SettingsPayload): ImageGenerati
};
}
/**
 * Build the editable transcription form from a settings payload.
 *
 * Falls back to `DEFAULT_TRANSCRIPTION_SETTINGS` when the payload carries no
 * `transcription` section, converts the snake_case limits to camelCase, and
 * represents a missing language as an empty string.
 *
 * @param payload Settings payload received from the server.
 * @returns Form values for the transcription settings section.
 */
function transcriptionFormFromPayload(payload: SettingsPayload): TranscriptionSettingsUpdate {
  const {
    enabled,
    provider,
    model,
    language,
    max_duration_sec: maxDurationSec,
    max_upload_mb: maxUploadMb,
  } = payload.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;

  return {
    enabled,
    provider,
    model,
    language: language ?? "",
    maxDurationSec,
    maxUploadMb,
  };
}
function networkSafetyFormFromPayload(payload: SettingsPayload): NetworkSafetySettingsUpdate {
return {
webuiAllowLocalServiceAccess:
@ -479,6 +515,7 @@ export function SettingsView({
const [providerSaving, setProviderSaving] = useState<string | null>(null);
const [webSearchSaving, setWebSearchSaving] = useState(false);
const [imageGenerationSaving, setImageGenerationSaving] = useState(false);
const [transcriptionSaving, setTranscriptionSaving] = useState(false);
const [networkSafetySaving, setNetworkSafetySaving] = useState(false);
const [hostEngineApplying, setHostEngineApplying] = useState(false);
const [error, setError] = useState<string | null>(null);
@ -511,6 +548,9 @@ export function SettingsView({
? imageGenerationFormFromPayload(initialSettings)
: DEFAULT_IMAGE_GENERATION_FORM,
);
const [transcriptionForm, setTranscriptionForm] = useState<TranscriptionSettingsUpdate>(
() => initialSettings ? transcriptionFormFromPayload(initialSettings) : DEFAULT_TRANSCRIPTION_FORM,
);
const [networkSafetyForm, setNetworkSafetyForm] = useState<NetworkSafetySettingsUpdate>(() =>
initialSettings ? networkSafetyFormFromPayload(initialSettings) : DEFAULT_NETWORK_SAFETY_FORM,
);
@ -543,6 +583,7 @@ export function SettingsView({
setForm(agentDraftFromPayload(payload));
setWebSearchForm((prev) => webSearchFormFromPayload(payload, prev));
setImageGenerationForm(imageGenerationFormFromPayload(payload));
setTranscriptionForm(transcriptionFormFromPayload(payload));
setNetworkSafetyForm(networkSafetyFormFromPayload(payload));
if (payload.restart_required_sections) {
setPendingRestartSections(pendingRestartSectionsFromPayload(payload));
@ -711,6 +752,19 @@ export function SettingsView({
);
}, [imageGenerationForm, settings]);
// Whether the transcription form has diverged from the loaded settings.
// Always false until settings exist; a payload without a `transcription`
// section is compared against DEFAULT_TRANSCRIPTION_SETTINGS.
const transcriptionDirty = useMemo(() => {
  if (!settings) return false;
  const saved = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
  const matchesSaved =
    transcriptionForm.enabled === saved.enabled &&
    transcriptionForm.provider === saved.provider &&
    transcriptionForm.model === saved.model &&
    transcriptionForm.language === (saved.language ?? "") &&
    transcriptionForm.maxDurationSec === saved.max_duration_sec &&
    transcriptionForm.maxUploadMb === saved.max_upload_mb;
  return !matchesSaved;
}, [settings, transcriptionForm]);
const networkSafetyDirty = useMemo(() => {
if (!settings) return false;
const currentLocalServiceAccess =
@ -913,6 +967,24 @@ export function SettingsView({
}
};
// Persist the transcription form. No-op unless settings are loaded, the form
// is dirty, and no save is already in flight. On success the returned payload
// is applied to local state; on failure the error message is surfaced via
// setError. The saving flag is always cleared in `finally`.
const saveTranscriptionSettings = async () => {
if (!settings || !transcriptionDirty || transcriptionSaving) return;
setTranscriptionSaving(true);
try {
const payload = await updateTranscriptionSettings(token, transcriptionForm);
applyPayload(payload);
if (payload.requires_restart) {
// NOTE(review): the pending-restart flag is recorded under the `browser`
// key rather than a voice-specific one — confirm this sharing is intended.
setPendingRestartSections((prev) => ({ ...prev, browser: true }));
}
await maybeRestartHostEngine(payload);
setError(null);
} catch (err) {
setError((err as Error).message);
} finally {
setTranscriptionSaving(false);
}
};
const saveNetworkSafetySettings = async () => {
if (!settings || !networkSafetyDirty || networkSafetySaving) return;
setNetworkSafetySaving(true);
@ -1333,6 +1405,22 @@ export function SettingsView({
requiresRestartPending={pendingRestartSections.image}
/>
);
case "voice":
return (
<TranscriptionSettings
settings={settings}
form={transcriptionForm}
dirty={transcriptionDirty}
saving={transcriptionSaving}
onChangeForm={setTranscriptionForm}
onSave={saveTranscriptionSettings}
onOpenProviders={() => selectSection("models")}
showBrandLogos={localPrefs.brandLogos}
onRestart={restartViaSettingsSurface}
isRestarting={isRestarting || hostEngineApplying}
requiresRestartPending={pendingRestartSections.browser}
/>
);
case "browser":
return (
<WebSettings
@ -1523,6 +1611,7 @@ const SETTINGS_NAV_ITEMS: Array<{ key: SettingsSectionKey; icon: LucideIcon; fal
{ key: "appearance", icon: Palette, fallback: "Appearance" },
{ key: "models", icon: SlidersHorizontal, fallback: "Models" },
{ key: "image", icon: ImageIcon, fallback: "Image" },
{ key: "voice", icon: Mic, fallback: "Voice" },
{ key: "browser", icon: Globe2, fallback: "Web" },
{ key: "runtime", icon: Server, fallback: "System" },
{ key: "advanced", icon: ShieldCheck, fallback: "Security" },
@ -1642,6 +1731,24 @@ function OverviewSettings({
const webStatus = settings.web.enable
? tx("settings.values.enabled", "Enabled")
: tx("settings.values.disabled", "Disabled");
const webSearchProvider =
settings.web_search.providers.find((provider) => provider.name === settings.web_search.provider) ??
settings.web_search.providers[0];
const webSearchProviderLabel = providerDisplayLabel(
settings.web_search.providers,
settings.web_search.provider,
);
const webSearchCredentialStatus =
webSearchProvider?.credential === "none"
? tx("settings.byok.webSearch.noCredentialRequired", "No key required")
: webSearchProvider?.credential === "base_url"
? settings.web_search.base_url
? tx("settings.values.configured", "Configured")
: tx("settings.values.notConfigured", "Not configured")
: settings.web_search.api_key_hint
? tx("settings.values.configured", "Configured")
: tx("settings.values.notConfigured", "Not configured");
const webCaption = `${webSearchProviderLabel} · ${webSearchCredentialStatus}`;
const imageStatus = settings.image_generation.enabled
? tx("settings.values.enabled", "Enabled")
: tx("settings.values.disabled", "Disabled");
@ -1650,6 +1757,15 @@ function OverviewSettings({
? tx("settings.values.configured", "Configured")
: tx("settings.values.notConfigured", "Not configured")
}`;
const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
const voiceStatus = transcription.enabled
? tx("settings.values.enabled", "Enabled")
: tx("settings.values.disabled", "Disabled");
const voiceCaption = `${providerDisplayLabel(transcription.providers, transcription.provider)} · ${
transcription.provider_configured
? tx("settings.values.configured", "Configured")
: tx("settings.values.notConfigured", "Not configured")
}`;
const isNativeHost = (settings.surface ?? settings.runtime_surface) === "native";
const workspaceCaption = shortWorkspacePath(settings.runtime.workspace_path);
const runtimeTitle = isNativeHost
@ -1691,8 +1807,8 @@ function OverviewSettings({
icon={Globe2}
valueLogoProvider={settings.web_search.provider}
title={tx("settings.overview.webSearch", "Web search")}
value={providerDisplayLabel(settings.web_search.providers, settings.web_search.provider)}
caption={webStatus}
value={webStatus}
caption={webCaption}
showBrandLogos={showBrandLogos}
onClick={() => onSelectSection("browser")}
/>
@ -1705,6 +1821,15 @@ function OverviewSettings({
showBrandLogos={showBrandLogos}
onClick={() => onSelectSection("image")}
/>
<OverviewListRow
icon={Mic}
valueLogoProvider={transcription.provider}
title={tx("settings.overview.voiceInput", "Voice input")}
value={voiceStatus}
caption={voiceCaption}
showBrandLogos={showBrandLogos}
onClick={() => onSelectSection("voice")}
/>
</SettingsGroup>
</section>
@ -2654,6 +2779,137 @@ function ImageGenerationSettings({
);
}
/**
 * Settings section for voice input / transcription.
 *
 * Renders, in order: an enable toggle, a provider picker, the selected
 * provider's configuration status (with a shortcut to the providers section
 * when it is not configured), a model input, a language hint input, the
 * duration/upload limits, and a save/restart footer.
 *
 * The component is controlled: all edits go through `onChangeForm`, and
 * saving/restarting is delegated to `onSave` / `onRestart`.
 */
function TranscriptionSettings({
settings,
form,
dirty,
saving,
onChangeForm,
onSave,
onOpenProviders,
showBrandLogos,
onRestart,
isRestarting,
requiresRestartPending,
}: {
settings: SettingsPayload;
form: TranscriptionSettingsUpdate;
dirty: boolean;
saving: boolean;
onChangeForm: Dispatch<SetStateAction<TranscriptionSettingsUpdate>>;
onSave: () => void;
onOpenProviders: () => void;
showBrandLogos: boolean;
onRestart?: () => void;
isRestarting?: boolean;
requiresRestartPending: boolean;
}) {
const { t } = useTranslation();
// Translate with an inline English fallback for keys missing from the catalog.
const tx = (key: string, fallback: string) => t(key, { defaultValue: fallback });
const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
// Status is derived from the provider chosen in the form (not the saved one);
// when the form's provider is not in the list, the first listed provider is used.
const selectedProvider =
transcription.providers.find((provider) => provider.name === form.provider) ??
transcription.providers[0];
const providerConfigured = !!selectedProvider?.configured;
return (
<section>
<SettingsSectionTitle>{tx("settings.sections.voiceInput", "Voice input")}</SettingsSectionTitle>
<SettingsGroup>
<SettingsRow
title={tx("settings.rows.transcription", "Transcription")}
description={tx("settings.help.transcription", "Transcribe microphone input before sending it. Chat channel voice messages use the same settings.")}
>
<ToggleButton
checked={form.enabled}
onChange={(enabled) => onChangeForm((prev) => ({ ...prev, enabled }))}
ariaLabel={tx("settings.rows.transcription", "Transcription")}
label={form.enabled ? tx("settings.values.on", "On") : tx("settings.values.off", "Off")}
/>
</SettingsRow>
<SettingsRow
title={tx("settings.rows.transcriptionProvider", "Provider")}
description={tx("settings.help.transcriptionProvider", "Uses the matching provider credentials from Providers.")}
>
<ProviderPicker
providers={transcription.providers}
value={form.provider}
emptyLabel={tx("settings.voice.selectProvider", "Select provider")}
showProviderLogos={showBrandLogos}
onChange={(provider) => onChangeForm((prev) => ({ ...prev, provider }))}
/>
</SettingsRow>
<SettingsRow
title={tx("settings.rows.transcriptionProviderStatus", "Provider status")}
description={tx("settings.help.transcriptionProviderStatus", "API keys stay under providers, not in transcription settings.")}
>
<div className="flex flex-wrap items-center justify-end gap-2">
<StatusPill tone={providerConfigured ? "success" : "neutral"}>
{providerConfigured
? tx("settings.values.configured", "Configured")
: tx("settings.values.notConfigured", "Not configured")}
</StatusPill>
{!providerConfigured ? (
<Button size="sm" variant="outline" onClick={onOpenProviders} className="rounded-full">
{tx("settings.voice.configureProvider", "Configure provider")}
</Button>
) : null}
</div>
</SettingsRow>
<SettingsRow
title={tx("settings.rows.transcriptionModel", "Model")}
description={tx("settings.help.transcriptionModel", "Leave as the resolved default unless your provider needs a custom model id.")}
>
<Input
value={form.model}
onChange={(event) => onChangeForm((prev) => ({ ...prev, model: event.target.value }))}
className="h-8 w-[min(300px,70vw)] rounded-full text-[13px]"
/>
</SettingsRow>
<SettingsRow
title={tx("settings.rows.transcriptionLanguage", "Language")}
description={tx("settings.help.transcriptionLanguage", "Optional ISO-639 hint such as en, zh, ja, or ko.")}
>
<Input
value={form.language}
onChange={(event) => onChangeForm((prev) => ({ ...prev, language: event.target.value }))}
placeholder={tx("settings.voice.languageAuto", "Auto")}
className="h-8 w-[min(180px,60vw)] rounded-full text-[13px]"
/>
</SettingsRow>
<SettingsRow title={tx("settings.rows.voiceLimits", "Limits")}>
<div className="flex flex-wrap justify-end gap-2">
<NumberInput
value={form.maxDurationSec}
min={1}
max={600}
suffix="s"
onChange={(maxDurationSec) => onChangeForm((prev) => ({ ...prev, maxDurationSec }))}
/>
<NumberInput
value={form.maxUploadMb}
min={1}
max={100}
suffix="MB"
onChange={(maxUploadMb) => onChangeForm((prev) => ({ ...prev, maxUploadMb }))}
/>
</div>
</SettingsRow>
<RestartSettingsFooter
dirty={dirty}
saving={saving}
pendingRestart={requiresRestartPending}
dirtyMessage={tx("settings.status.restartAfterSaving", "Save changes, then restart when ready.")}
pendingMessage={tx("settings.status.savedRestartApply", "Saved. Restart when ready.")}
onSave={onSave}
onRestart={onRestart}
isRestarting={isRestarting}
/>
</SettingsGroup>
</section>
);
}
function WebSettings({
settings,
form,

View File

@ -78,16 +78,13 @@ function buildTokenUsageCalendar(
const today = utcDateFromIsoDay(isoDayInTimeZone(new Date(), timeZone));
const end = addUtcDays(today, 6 - today.getUTCDay());
const start = addUtcDays(end, -(TOKEN_HEATMAP_CELLS - 1));
const seenMonths = new Set<string>();
const monthLabels: TokenUsageMonthLabel[] = [];
const cells = Array.from({ length: TOKEN_HEATMAP_CELLS }, (_, index) => {
const date = addUtcDays(start, index);
const key = isoDay(date);
const row = byDate.get(key);
const monthKey = key.slice(0, 7);
if (!seenMonths.has(monthKey)) {
seenMonths.add(monthKey);
if (date.getUTCDate() === 1) {
monthLabels.push({
label: monthFormatter.format(date),
column: Math.floor(index / 7) + 1,
@ -186,16 +183,12 @@ export function TokenUsageHeatmap({
{tx("settings.usage.shortTitle", "Token Usage")}
</span>
</div>
<div
className="mb-2 grid min-h-4 gap-1.5 text-[10px] font-normal leading-4 text-muted-foreground/62"
style={{ gridTemplateColumns: `repeat(${TOKEN_HEATMAP_COLUMNS}, minmax(0, 1fr))` }}
aria-hidden
>
<div className="relative mb-2 h-4 text-[10px] font-normal leading-4 text-muted-foreground/62" aria-hidden>
{monthLabels.map((month) => (
<span
key={`${month.label}-${month.column}`}
className="whitespace-nowrap"
style={{ gridColumnStart: month.column, gridColumnEnd: "span 4" }}
className="absolute top-0 whitespace-nowrap"
style={{ left: `${((month.column - 1) / TOKEN_HEATMAP_COLUMNS) * 100}%` }}
>
{month.label}
</span>

View File

@ -31,6 +31,7 @@ import {
History,
ImageIcon,
Loader2,
Mic,
Plus,
RotateCw,
Shield,
@ -46,6 +47,12 @@ import {
import { useTranslation } from "react-i18next";
import { Button } from "@/components/ui/button";
import {
Tooltip,
TooltipContent,
TooltipProvider,
TooltipTrigger,
} from "@/components/ui/tooltip";
import {
WorkspaceAccessMenu,
WorkspaceProjectPicker,
@ -59,6 +66,7 @@ import {
} from "@/hooks/useAttachedImages";
import { useClipboardAndDrop } from "@/hooks/useClipboardAndDrop";
import type { SendImage, SendOptions } from "@/hooks/useNanobotStream";
import { useVoiceRecorder, type VoiceRecorderErrorKey } from "@/hooks/useVoiceRecorder";
import type {
CliAppInfo,
GoalStateWsPayload,
@ -79,6 +87,9 @@ import { cn } from "@/lib/utils";
/** ``<input accept>``: aligned with the server's MIME whitelist. SVG is
* deliberately excluded to avoid an embedded-script XSS surface. */
const ACCEPT_ATTR = "image/png,image/jpeg,image/webp,image/gif";
// `KeyboardEvent.code` of the key that, together with Control and Shift,
// forms the voice input shortcut.
const VOICE_SHORTCUT_CODE = "KeyD";
// Value exposed through the voice button's `aria-keyshortcuts` attribute.
const VOICE_SHORTCUT_ARIA = "Control+Shift+D";
// Platform buckets produced by getVoiceShortcutPlatform().
type VoiceShortcutPlatform = "apple" | "chromeos" | "linux" | "other" | "windows";
function formatBytes(n: number): string {
if (n < 1024) return `${n} B`;
@ -86,6 +97,54 @@ function formatBytes(n: number): string {
return `${(n / (1024 * 1024)).toFixed(1)} MB`;
}
/**
 * Check whether a keyboard event is the voice shortcut chord.
 *
 * The chord is the `VOICE_SHORTCUT_CODE` key with Control and Shift held and
 * neither Alt nor Meta pressed.
 */
function isVoiceShortcutDown(event: KeyboardEvent): boolean {
  if (event.code !== VOICE_SHORTCUT_CODE) return false;
  if (!event.ctrlKey || !event.shiftKey) return false;
  return !(event.altKey || event.metaKey);
}
/**
 * Check whether a key event releases the voice shortcut: either the shortcut
 * key itself (matched by `code`) or one of its modifiers, Control or Shift
 * (matched by `key`).
 */
function isVoiceShortcutRelease(event: KeyboardEvent): boolean {
  if (event.code === VOICE_SHORTCUT_CODE) return true;
  const releasedKey = event.key;
  return releasedKey === "Control" || releasedKey === "Shift";
}
/**
 * Classify the current platform for shortcut labelling.
 *
 * Concatenates `navigator.userAgentData.platform` (when present),
 * `navigator.platform`, and `navigator.userAgent`, lower-cases the result, and
 * matches it against an ordered list of patterns. A `MacIntel` platform with
 * more than one touch point is classified as Apple before any pattern runs.
 *
 * @returns `"other"` when `navigator` is unavailable or nothing matches.
 */
function getVoiceShortcutPlatform(): VoiceShortcutPlatform {
  if (typeof navigator === "undefined") return "other";

  const nav = navigator as Navigator & { userAgentData?: { platform?: string } };
  const fingerprint = [nav.userAgentData?.platform, nav.platform, nav.userAgent]
    .filter(Boolean)
    .join(" ")
    .toLowerCase();

  if (nav.platform === "MacIntel" && nav.maxTouchPoints > 1) return "apple";

  // Order matters: the first matching pattern wins.
  const rules: ReadonlyArray<[RegExp, VoiceShortcutPlatform]> = [
    [/mac|iphone|ipad|ipod/, "apple"],
    [/win/, "windows"],
    [/cros/, "chromeos"],
    [/linux|x11|android/, "linux"],
  ];
  for (const [pattern, label] of rules) {
    if (pattern.test(fingerprint)) return label;
  }
  return "other";
}
/**
 * Human-readable label for the voice shortcut on the current platform:
 * symbol form on Apple platforms, a "Ctrl"-prefixed form everywhere else.
 */
function getVoiceShortcutLabel(): string {
  const isApple = getVoiceShortcutPlatform() === "apple";
  return isApple ? "⌃⇧D" : "Ctrl ⇧ D";
}
interface ThreadComposerProps {
onSend: (content: string, images?: SendImage[], options?: SendOptions) => void;
disabled?: boolean;
@ -101,6 +160,7 @@ interface ThreadComposerProps {
cliApps?: CliAppInfo[];
mcpPresets?: McpPresetInfo[];
onStop?: () => void;
onTranscribeAudio?: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
/** Unix seconds from server; turn elapsed timer above input while set. */
runStartedAt?: number | null;
/** Sustained objective for this chat (WebSocket ``goal_state``). */
@ -138,6 +198,45 @@ const QUEUED_PROMPTS_STORAGE_PREFIX = "nanobot.webui.composerQueuedGuidance.v1:"
const QUEUED_PROMPTS_LIMIT = 20;
const QUEUED_PROMPT_MAX_CHARS = 4000;
/**
 * Recording indicator: a row of vertical bars (one per entry in `levels`,
 * each entry used as the bar's height) followed by the elapsed-time label.
 *
 * The container is a polite live region labelled by `ariaLabel`; the bars
 * themselves are hidden from assistive technology.
 */
function VoiceRecordingMeter({
  ariaLabel,
  className,
  elapsedLabel,
  isHero,
  levels,
}: {
  ariaLabel: string;
  className?: string;
  elapsedLabel: string;
  isHero: boolean;
  levels: number[];
}) {
  const rowHeightClass = isHero ? "h-8" : "h-9";
  const bars = levels.map((barHeight, barIndex) => (
    <span
      key={barIndex}
      className="w-[2px] rounded-full bg-current opacity-85 transition-[height] duration-75 ease-linear motion-reduce:transition-none"
      style={{ height: barHeight }}
    />
  ));

  return (
    <div
      className={cn(
        "flex min-w-0 items-center gap-2 text-neutral-700 dark:text-white",
        rowHeightClass,
        className,
      )}
      aria-live="polite"
      aria-label={ariaLabel}
    >
      <span className="flex h-5 min-w-0 flex-1 items-center justify-between overflow-hidden" aria-hidden>
        {bars}
      </span>
      <span className="min-w-[2.1rem] text-right text-[12px] font-medium tabular-nums text-muted-foreground">
        {elapsedLabel}
      </span>
    </div>
  );
}
type SlashPalettePlacement = "above" | "below";
interface SlashPaletteLayout {
@ -656,6 +755,7 @@ export function ThreadComposer({
cliApps = [],
mcpPresets = [],
onStop,
onTranscribeAudio,
runStartedAt = null,
goalState,
workspaceScope = null,
@ -685,7 +785,9 @@ export function ThreadComposer({
const wasStreamingRef = useRef(isStreaming);
const skipNextQueuedFlushRef = useRef(false);
const skipQueuedPromptPersistRef = useRef(false);
const voiceShortcutDownRef = useRef(false);
const isHero = variant === "hero";
const voiceShortcutLabel = useMemo(getVoiceShortcutLabel, []);
const queuedPromptStorageKey = useMemo(
() => queuedPromptsStorageKey(pendingQueueKey),
[pendingQueueKey],
@ -1026,6 +1128,65 @@ export function ThreadComposer({
});
}, []);
// Append a finished transcript to the composer text. Blank transcripts are
// ignored. A blank draft is replaced outright; otherwise the transcript is
// appended, separated by a single space unless the draft already ends in
// whitespace. Afterwards the menu-dismissed flags and inline error are reset
// and the textarea is resized.
const appendTranscription = useCallback((text: string) => {
  const spoken = text.trim();
  if (spoken.length === 0) return;
  setValue((draft) => {
    if (draft.trim().length === 0) return spoken;
    const joiner = /[\s\n]$/.test(draft) ? "" : " ";
    return draft + joiner + spoken;
  });
  setSlashMenuDismissed(false);
  setCliAppMenuDismissed(false);
  setInlineError(null);
  resizeTextarea();
}, [resizeTextarea]);
// Stable callback handed to the recorder so it can clear the composer's inline error.
const clearInlineError = useCallback(() => setInlineError(null), []);
// Maps a recorder error key onto its translated message under
// `thread.composer.voiceErrors.*` and shows it as the inline error.
const setVoiceError = useCallback((key: VoiceRecorderErrorKey) => {
setInlineError(t(`thread.composer.voiceErrors.${key}`));
}, [t]);
// Recorder instance for this composer; transcripts are appended to the draft.
const voiceRecorder = useVoiceRecorder({
disabled,
onClearError: clearInlineError,
onError: setVoiceError,
onTranscript: appendTranscription,
onTranscribeAudio,
});
// Window-level hold-to-talk shortcut (Control+Shift+D). Registered only when
// a transcription handler is available. Pressing the chord starts a shortcut
// hold; releasing the shortcut key or either modifier, or the window losing
// focus, ends it. `voiceShortcutDownRef` tracks whether a hold is in progress
// so begin/end are each called once per hold.
useEffect(() => {
  if (!onTranscribeAudio) return;

  function onKeyDown(event: KeyboardEvent): void {
    if (!isVoiceShortcutDown(event)) return;
    // Suppress the default action for every matching keydown — including the
    // auto-repeat events fired while the chord stays held — so the browser's
    // own binding for this chord cannot trigger in the middle of a hold.
    event.preventDefault();
    if (event.repeat || voiceShortcutDownRef.current) return;
    voiceShortcutDownRef.current = true;
    voiceRecorder.beginShortcutHold();
  }

  function onKeyUp(event: KeyboardEvent): void {
    if (!voiceShortcutDownRef.current || !isVoiceShortcutRelease(event)) return;
    event.preventDefault();
    voiceShortcutDownRef.current = false;
    voiceRecorder.endShortcutHold();
  }

  // No keyup is delivered once the window loses focus, so treat blur as a release.
  function onWindowBlur(): void {
    if (!voiceShortcutDownRef.current) return;
    voiceShortcutDownRef.current = false;
    voiceRecorder.endShortcutHold();
  }

  window.addEventListener("keydown", onKeyDown);
  window.addEventListener("keyup", onKeyUp);
  window.addEventListener("blur", onWindowBlur);
  return () => {
    window.removeEventListener("keydown", onKeyDown);
    window.removeEventListener("keyup", onKeyUp);
    window.removeEventListener("blur", onWindowBlur);
  };
}, [onTranscribeAudio, voiceRecorder.beginShortcutHold, voiceRecorder.endShortcutHold]);
const chooseSlashCommand = useCallback(
(command: SlashCommand) => {
if (command.command === "/stop" && isStreaming && onStop) {
@ -1341,6 +1502,23 @@ export function ThreadComposer({
);
const attachButtonDisabled = disabled || full;
// The voice button is rendered only when a transcription handler was provided.
const showVoiceButton = Boolean(onTranscribeAudio);
// Accessible label for the recording meter, including the elapsed time.
const voiceRecordingStatusLabel = t("thread.composer.voice.recordingStatus", {
time: voiceRecorder.elapsedLabel,
defaultValue: `Recording ${voiceRecorder.elapsedLabel}`,
});
// aria-label for the voice button, following the recorder state.
const voiceButtonLabel =
voiceRecorder.state === "recording"
? t("thread.composer.voice.stop")
: voiceRecorder.state === "transcribing"
? t("thread.composer.voice.transcribing")
: t("thread.composer.tools.voice");
// Tooltip text; identical to the label except in the idle state, where the
// usage hint is shown instead of the tool name.
const voiceButtonTooltip =
voiceRecorder.state === "recording"
? t("thread.composer.voice.stop")
: voiceRecorder.state === "transcribing"
? t("thread.composer.voice.transcribing")
: t("thread.composer.voice.hint");
const showStopButton = isStreaming && !!onStop;
const relaxedHeroInput = isHero && images.length === 0 && !isStreaming;
const inputTextClasses = cn(
@ -1531,7 +1709,15 @@ export function ThreadComposer({
>
<Plus className={cn(isHero ? "h-[18px] w-[18px]" : "h-4 w-4")} />
</Button>
{workspaceScope ? (
{voiceRecorder.isRecording ? (
<VoiceRecordingMeter
ariaLabel={voiceRecordingStatusLabel}
className="mx-1 flex-1"
elapsedLabel={voiceRecorder.elapsedLabel}
isHero={isHero}
levels={voiceRecorder.levels}
/>
) : workspaceScope ? (
<WorkspaceAccessMenu
scope={workspaceScope}
disabled={disabled || workspaceScopeDisabled}
@ -1542,7 +1728,7 @@ export function ThreadComposer({
) : null}
</div>
<div className={cn("flex shrink-0 items-center", isHero ? "gap-1.5" : "gap-2")}>
{modelLabel ? (
{modelLabel && !voiceRecorder.isRecording ? (
<ComposerModelBadge
label={modelLabel}
provider={modelProvider}
@ -1552,6 +1738,53 @@ export function ThreadComposer({
onClick={modelNeedsSetup ? onModelBadgeClick : undefined}
/>
) : null}
{showVoiceButton ? (
<TooltipProvider delayDuration={220} skipDelayDuration={80}>
<Tooltip>
<TooltipTrigger asChild>
<Button
type="button"
size="icon"
variant="ghost"
disabled={voiceRecorder.buttonDisabled}
aria-label={voiceButtonLabel}
aria-keyshortcuts={VOICE_SHORTCUT_ARIA}
title={voiceButtonTooltip}
onPointerDown={voiceRecorder.beginPress}
onPointerUp={voiceRecorder.endPress}
onPointerCancel={voiceRecorder.endPress}
onClick={voiceRecorder.handleClick}
className={cn(
"rounded-full border border-transparent text-muted-foreground hover:bg-muted/65 hover:text-foreground",
isHero ? "h-8 w-8" : "h-9 w-9",
voiceRecorder.isRecording &&
"bg-red-500 text-white shadow-[0_8px_20px_rgba(239,68,68,0.22)] hover:bg-red-500 hover:text-white",
)}
>
{voiceRecorder.state === "transcribing" ? (
<Loader2 className={cn(isHero ? "h-4 w-4" : "h-4 w-4", "animate-spin")} />
) : voiceRecorder.isRecording ? (
<Square className={cn(isHero ? "h-3.5 w-3.5" : "h-3.5 w-3.5")} fill="currentColor" />
) : (
<Mic className={cn(isHero ? "h-4 w-4" : "h-4 w-4")} />
)}
</Button>
</TooltipTrigger>
<TooltipContent
side="top"
align="center"
className="flex items-center gap-2 rounded-full border border-border/70 bg-background px-3 py-1.5 text-[13px] font-medium text-foreground shadow-[0_8px_24px_rgba(15,23,42,0.13)] dark:border-white/10 dark:bg-neutral-900 dark:text-white"
>
<span>{voiceButtonTooltip}</span>
{voiceRecorder.state === "idle" ? (
<kbd className="rounded-full bg-muted px-2 py-0.5 font-sans text-[12px] font-semibold leading-none text-muted-foreground dark:bg-white/10 dark:text-white/80">
{voiceShortcutLabel}
</kbd>
) : null}
</TooltipContent>
</Tooltip>
</TooltipProvider>
) : null}
<Button
type={showStopButton || modelNeedsSetup ? "button" : "submit"}
size="icon"

View File

@ -302,6 +302,7 @@ export function ThreadShell({
runStartedAt,
goalState,
send,
transcribeAudio,
stop,
setMessages,
streamError,
@ -642,6 +643,7 @@ export function ThreadShell({
cliApps={cliApps}
mcpPresets={mcpPresets}
onStop={stop}
onTranscribeAudio={transcribeAudio}
runStartedAt={runStartedAt}
goalState={goalState}
workspaceScope={workspaceScope}
@ -672,6 +674,7 @@ export function ThreadShell({
cliApps={cliApps}
mcpPresets={mcpPresets}
runStartedAt={runStartedAt}
onTranscribeAudio={transcribeAudio}
goalState={goalState}
workspaceScope={workspaceScope}
workspaceDefaultScope={workspaceDefaultScope}

View File

@ -438,6 +438,7 @@ export function useNanobotStream(
/** Latest sustained goal for this ``chatId`` (``goal_state`` WS events). */
goalState: GoalStateWsPayload | undefined;
send: (content: string, images?: SendImage[], options?: SendOptions) => void;
transcribeAudio: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
stop: () => void;
setMessages: React.Dispatch<React.SetStateAction<UIMessage[]>>;
/** Latest transport-level fault raised since the last ``dismissStreamError``.
@ -1089,12 +1090,19 @@ export function useNanobotStream(
client.sendMessage(chatId, "/stop");
}, [chatId, clearActivitySegment, client, flushPendingStreamEvents]);
// Memoized pass-through to `client.transcribeAudio`; the callback identity
// changes only when `client` does. Resolves to a string per the hook's
// declared return type.
const transcribeAudio = useCallback(
(dataUrl: string, options?: { durationMs?: number }) =>
client.transcribeAudio(dataUrl, options),
[client],
);
return {
messages,
isStreaming,
runStartedAt,
goalState,
send,
transcribeAudio,
stop,
setMessages,
streamError,

View File

@ -0,0 +1,422 @@
import {
useCallback,
useEffect,
useRef,
useState,
type PointerEvent as ReactPointerEvent,
} from "react";
// Timing thresholds in milliseconds.
// NOTE(review): their call sites are outside this excerpt; presumably the
// maximum/minimum recording length, the delay before a "no input" hint, and
// the press duration that counts as a hold — confirm against the usages.
const VOICE_RECORDING_MAX_MS = 120_000;
const VOICE_RECORDING_MIN_MS = 650;
const VOICE_NO_INPUT_HINT_MS = 1_100;
const VOICE_HOLD_START_MS = 140;
// Number of bars kept in the waveform `levels` array.
const VOICE_WAVEFORM_BAR_COUNT = 64;
// Bar height used for every bar of the idle waveform.
const VOICE_WAVEFORM_SILENT_HEIGHT = 3;
// NOTE(review): presumably the height range produced by
// waveformHeightFromLevel() — confirm.
const VOICE_WAVEFORM_MIN_HEIGHT = 7;
const VOICE_WAVEFORM_MAX_HEIGHT = 34;
// Measured level at or above which the waveform tick treats input as present:
// it cancels the pending input-hint timer and clears a visible no-input hint.
const VOICE_MIN_LEVEL = 0.018;
// Idle waveform: VOICE_WAVEFORM_BAR_COUNT bars, all at the silent height.
// Used as the initial `levels` state and when a new waveform starts.
const VOICE_WAVEFORM_IDLE_LEVELS = Array.from(
{ length: VOICE_WAVEFORM_BAR_COUNT },
() => VOICE_WAVEFORM_SILENT_HEIGHT,
);
// Recording MIME types listed in order.
// NOTE(review): presumably probed in this order by mediaRecorderOptions() — confirm.
const VOICE_MIME_CANDIDATES = [
"audio/webm;codecs=opus",
"audio/webm",
"audio/mp4",
"audio/ogg;codecs=opus",
] as const;
/** Values held by the recorder's `state`; it starts as "idle". */
export type VoiceRecorderState = "idle" | "recording" | "transcribing";
/** Error identifiers reported through `onError`; callers map them to messages. */
export type VoiceRecorderErrorKey =
| "failed"
| "noInput"
| "notConfigured"
| "permission"
| "tooLong"
| "tooShort"
| "unsupported";
/** Options accepted by `useVoiceRecorder`. */
interface VoiceRecorderOptions {
disabled?: boolean;
/** Invoked to dismiss a previously shown error (e.g. once input is detected after a no-input hint). */
onClearError: () => void;
/** Invoked with the key of the error to surface. */
onError: (key: VoiceRecorderErrorKey) => void;
onTranscript: (text: string) => void;
/** Transcription backend; when omitted, `startRecording` returns without recording. */
onTranscribeAudio?: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
}
export function useVoiceRecorder({
disabled,
onClearError,
onError,
onTranscript,
onTranscribeAudio,
}: VoiceRecorderOptions) {
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
const chunksRef = useRef<BlobPart[]>([]);
const streamRef = useRef<MediaStream | null>(null);
const audioRef = useRef<VoiceAudioState | null>(null);
const startedAtRef = useRef(0);
const maxTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
const inputHintTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
const holdTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
const holdActiveRef = useRef(false);
const startPendingRef = useRef(false);
const stopAfterStartRef = useRef(false);
const suppressClickRef = useRef(false);
const suppressClickTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
const shortcutActiveRef = useRef(false);
const levelObservedRef = useRef(false);
const peakLevelRef = useRef(0);
const levelReliableRef = useRef(false);
const noInputHintVisibleRef = useRef(false);
const [state, setState] = useState<VoiceRecorderState>("idle");
const [elapsedMs, setElapsedMs] = useState(0);
const [levels, setLevels] = useState<number[]>(VOICE_WAVEFORM_IDLE_LEVELS);
const clearInputHintTimer = useCallback(() => clearTimer(inputHintTimerRef), []);
const clearSuppressClickTimer = useCallback(() => clearTimer(suppressClickTimerRef), []);
const suppressNextClick = useCallback(() => {
clearSuppressClickTimer();
suppressClickRef.current = true;
suppressClickTimerRef.current = setTimeout(() => {
suppressClickRef.current = false;
suppressClickTimerRef.current = null;
}, 500);
}, [clearSuppressClickTimer]);
const stopWaveform = useCallback(() => {
const audio = audioRef.current;
audioRef.current = null;
if (!audio) return;
if (audio.frame !== null) cancelAnimationFrame(audio.frame);
audio.source.disconnect();
audio.analyser.disconnect();
void audio.context.close().catch(() => undefined);
}, []);
// Starts the live waveform/level meter for `stream`.
//
// Builds an AudioContext -> MediaStreamSource -> Analyser graph and samples it once
// per animation frame. Each sample updates the peak level refs (used later to
// decide whether the recording was silent), dismisses the "no input" hint as soon
// as real input is heard, and pushes a new bar height into `levels`.
//
// Does nothing when the runtime exposes no AudioContext. Any failure while building
// the graph is swallowed: recording still works, only the meter is missing.
const startWaveform = useCallback((stream: MediaStream) => {
  const AudioContextCtor = audioContextConstructor();
  if (!AudioContextCtor) return;
  stopWaveform();
  setLevels(VOICE_WAVEFORM_IDLE_LEVELS);
  // Tracked outside the try block so a context created before a failure can be closed.
  let createdContext: AudioContext | null = null;
  try {
    const context = new AudioContextCtor();
    createdContext = context;
    const source = context.createMediaStreamSource(stream);
    const analyser = context.createAnalyser();
    analyser.fftSize = 256;
    analyser.smoothingTimeConstant = 0.68;
    source.connect(analyser);
    const audio: VoiceAudioState = {
      analyser,
      context,
      data: new Uint8Array(analyser.fftSize),
      frame: null,
      source,
    };
    const tick = () => {
      const current = audioRef.current;
      // stopWaveform() cleared the ref: stop rescheduling.
      if (!current) return;
      if (current.context.state !== "running") {
        // The context may start suspended; keep nudging it and skip sampling
        // so silence is not measured from a context that is not processing audio.
        void current.context.resume().catch(() => undefined);
        current.frame = requestAnimationFrame(tick);
        return;
      }
      current.analyser.getByteTimeDomainData(current.data);
      const level = voiceLevelFromSamples(current.data);
      levelReliableRef.current = true;
      levelObservedRef.current = true;
      peakLevelRef.current = Math.max(peakLevelRef.current, level);
      if (level >= VOICE_MIN_LEVEL) {
        // Real input was heard: the hint is no longer needed.
        clearInputHintTimer();
        if (noInputHintVisibleRef.current) {
          noInputHintVisibleRef.current = false;
          onClearError();
        }
      }
      // Scroll the waveform: drop the oldest bar, append the newest.
      setLevels((currentLevels) => [
        ...currentLevels.slice(1),
        waveformHeightFromLevel(level),
      ]);
      current.frame = requestAnimationFrame(tick);
    };
    audioRef.current = audio;
    void context.resume().catch(() => undefined);
    audio.frame = requestAnimationFrame(tick);
  } catch {
    // stopWaveform() only closes a context that was registered in audioRef. If the
    // failure happened earlier (e.g. createMediaStreamSource threw), close the
    // orphaned context here so it does not leak.
    const registered = audioRef.current !== null;
    stopWaveform();
    if (!registered && createdContext) {
      void createdContext.close().catch(() => undefined);
    }
  }
}, [clearInputHintTimer, onClearError, stopWaveform]);
// Releases everything a recording session holds: pending timers, the waveform
// meter, the microphone tracks and the recorder reference, and resets the
// per-session flags. Used after a recording stops, when starting fails, and on unmount.
const cleanupRecording = useCallback(() => {
clearTimer(holdTimerRef);
clearInputHintTimer();
clearTimer(maxTimerRef);
stopWaveform();
// Stopping the tracks is what actually releases the microphone.
streamRef.current?.getTracks().forEach((track) => track.stop());
streamRef.current = null;
mediaRecorderRef.current = null;
startPendingRef.current = false;
shortcutActiveRef.current = false;
noInputHintVisibleRef.current = false;
}, [clearInputHintTimer, stopWaveform]);
// Stops the active MediaRecorder, which triggers its onstop handler.
// No-op when there is no recorder or it is already inactive.
const stopRecording = useCallback(() => {
const recorder = mediaRecorderRef.current;
if (!recorder || recorder.state === "inactive") return;
recorder.stop();
}, []);
// Stops recording now if a recorder is running; if the recorder is still being
// set up (start pending), records the request so it is honoured once start completes.
const stopRecordingWhenReady = useCallback(() => {
const recorder = mediaRecorderRef.current;
if (recorder && recorder.state !== "inactive") {
stopRecording();
} else if (startPendingRef.current) {
// Consumed by startRecordingWithDeferredStop after startRecording resolves.
stopAfterStartRef.current = true;
}
}, [stopRecording]);
// Requests the microphone and starts a new recording session.
//
// Does nothing unless a transcription callback is available, the recorder is idle
// and no other start is in flight. Reports "unsupported" when the runtime lacks
// getUserMedia or MediaRecorder, and "permission" when acquiring the microphone
// or starting the recorder fails.
//
// When the recorder stops, its onstop handler releases all resources and then:
//   - no data captured                  -> back to idle silently
//   - shorter than VOICE_RECORDING_MIN_MS -> "tooShort"
//   - level meter measured only silence -> "noInput"
//   - otherwise                         -> encodes the audio as a data URL, transcribes it
//                                          and hands the result to onTranscript.
const startRecording = useCallback(async () => {
  if (!onTranscribeAudio || state !== "idle" || startPendingRef.current) return;
  if (!navigator.mediaDevices?.getUserMedia || typeof MediaRecorder === "undefined") {
    onError("unsupported");
    return;
  }
  startPendingRef.current = true;
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    // Register the stream before anything else can throw (the MediaRecorder
    // constructor and recorder.start() both can), so the catch block's
    // cleanupRecording() is able to stop the tracks and release the microphone.
    streamRef.current = stream;
    const recorder = new MediaRecorder(stream, mediaRecorderOptions());
    chunksRef.current = [];
    mediaRecorderRef.current = recorder;
    startedAtRef.current = Date.now();
    levelObservedRef.current = false;
    peakLevelRef.current = 0;
    levelReliableRef.current = false;
    noInputHintVisibleRef.current = false;
    setElapsedMs(0);
    startWaveform(stream);
    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) chunksRef.current.push(event.data);
    };
    recorder.onstop = () => {
      // Snapshot everything that cleanupRecording() is about to reset.
      const chunks = chunksRef.current.splice(0);
      const durationMs = Math.max(0, Date.now() - startedAtRef.current);
      const mimeType = recorder.mimeType || "audio/webm";
      // Only trust "silence" when the level meter actually produced samples.
      const hasMeasuredSilence =
        levelReliableRef.current
        && levelObservedRef.current
        && peakLevelRef.current < VOICE_MIN_LEVEL;
      cleanupRecording();
      if (chunks.length === 0) {
        setState("idle");
        return;
      }
      if (durationMs < VOICE_RECORDING_MIN_MS) {
        setState("idle");
        onError("tooShort");
        return;
      }
      if (hasMeasuredSilence) {
        setState("idle");
        onError("noInput");
        return;
      }
      setState("transcribing");
      void blobToDataUrl(new Blob(chunks, { type: mimeType }))
        .then((dataUrl) => onTranscribeAudio(dataUrl, { durationMs }))
        .then(onTranscript)
        .catch((error) => onError(transcriptionErrorKey(error)))
        .finally(() => setState("idle"));
    };
    recorder.start();
    setState("recording");
    onClearError();
    // Hard cap on recording length.
    maxTimerRef.current = setTimeout(stopRecording, VOICE_RECORDING_MAX_MS);
    // Surface a hint if the meter is working but has heard nothing so far.
    inputHintTimerRef.current = setTimeout(() => {
      const recording = mediaRecorderRef.current?.state === "recording";
      if (
        !recording
        || !levelReliableRef.current
        || !levelObservedRef.current
        || peakLevelRef.current >= VOICE_MIN_LEVEL
      ) {
        return;
      }
      noInputHintVisibleRef.current = true;
      onError("noInput");
    }, VOICE_NO_INPUT_HINT_MS);
  } catch {
    cleanupRecording();
    setState("idle");
    onError("permission");
  }
}, [
  cleanupRecording,
  onClearError,
  onError,
  onTranscribeAudio,
  onTranscript,
  startWaveform,
  state,
  stopRecording,
]);
// Starts recording for hold-style gestures. If the gesture ended while the
// recorder was still starting (stopAfterStartRef set by stopRecordingWhenReady),
// the recording is stopped as soon as the start completes.
const startRecordingWithDeferredStop = useCallback(() => {
// Drop any stale stop request left over from a previous gesture.
stopAfterStartRef.current = false;
void startRecording().then(() => {
if (!stopAfterStartRef.current) return;
stopAfterStartRef.current = false;
stopRecording();
});
}, [startRecording, stopRecording]);
// Pointer-down handler for the voice button. Arms a timer; if the pointer is
// still down after VOICE_HOLD_START_MS the press becomes a hold-to-record gesture.
// Ignores non-primary mouse buttons and presses while disabled or not idle.
const beginPress = useCallback((event: ReactPointerEvent<HTMLButtonElement>) => {
if (event.pointerType === "mouse" && event.button !== 0) return;
if (!onTranscribeAudio || disabled || state !== "idle") return;
clearTimer(holdTimerRef);
try {
event.currentTarget.setPointerCapture(event.pointerId);
} catch {
// Some embedded runtimes do not expose pointer capture for toolbar buttons.
}
holdTimerRef.current = setTimeout(() => {
holdTimerRef.current = null;
holdActiveRef.current = true;
// The click that follows this press must not toggle recording again.
suppressNextClick();
startRecordingWithDeferredStop();
}, VOICE_HOLD_START_MS);
}, [disabled, onTranscribeAudio, startRecordingWithDeferredStop, state, suppressNextClick]);
// Pointer-up/cancel handler. Cancels a hold that has not triggered yet; if the
// hold had already started a recording, stops it (or defers the stop until the
// recorder is ready) and suppresses the click generated by the release.
const endPress = useCallback(() => {
const wasHoldRecording = holdActiveRef.current;
clearTimer(holdTimerRef);
// A short press is left for handleClick to deal with.
if (!wasHoldRecording) return;
holdActiveRef.current = false;
suppressNextClick();
stopRecordingWhenReady();
}, [stopRecordingWhenReady, suppressNextClick]);
// Click handler: toggles recording (stop when recording, otherwise try to start).
// A click flagged by suppressNextClick is consumed without any effect.
const handleClick = useCallback(() => {
if (suppressClickRef.current) {
clearSuppressClickTimer();
suppressClickRef.current = false;
return;
}
if (state === "recording") stopRecording();
else void startRecording();
}, [clearSuppressClickTimer, startRecording, state, stopRecording]);
// Starts a hold-to-record session from a shortcut. The shortcutActiveRef guard
// makes repeated calls while a shortcut hold is already active a no-op.
const beginShortcutHold = useCallback(() => {
if (!onTranscribeAudio || disabled || state !== "idle" || shortcutActiveRef.current) return;
shortcutActiveRef.current = true;
startRecordingWithDeferredStop();
}, [disabled, onTranscribeAudio, startRecordingWithDeferredStop, state]);
// Ends a shortcut hold: stops the recording now, or as soon as the recorder is ready.
// No-op when no shortcut hold is active.
const endShortcutHold = useCallback(() => {
if (!shortcutActiveRef.current) return;
shortcutActiveRef.current = false;
stopRecordingWhenReady();
}, [stopRecordingWhenReady]);
// Keeps the elapsed-time readout current: refreshed every 250 ms while
// recording, reset to 0 in every other state.
useEffect(() => {
if (state !== "recording") {
setElapsedMs(0);
return;
}
const updateElapsed = () => {
setElapsedMs(Math.max(0, Date.now() - startedAtRef.current));
};
// Show a value immediately rather than waiting for the first interval tick.
updateElapsed();
const interval = window.setInterval(updateElapsed, 250);
return () => window.clearInterval(interval);
}, [state]);
// The effect body returns cleanupRecording itself as the cleanup function, so
// recording resources are released on unmount (or if the callback identity changes).
useEffect(() => cleanupRecording, [cleanupRecording]);
// Likewise cancel a pending click-suppression timeout on unmount.
useEffect(() => () => clearSuppressClickTimer(), [clearSuppressClickTimer]);
return {
beginShortcutHold,
beginPress,
buttonDisabled: disabled || state === "transcribing",
elapsedLabel: formatVoiceElapsed(elapsedMs),
endShortcutHold,
endPress,
handleClick,
isRecording: state === "recording",
levels,
state,
};
}
/** Web Audio objects backing the live waveform meter for one recording session. */
interface VoiceAudioState {
/** Analyser node the level samples are read from. */
analyser: AnalyserNode;
/** Audio context that owns the graph; closed when the waveform stops. */
context: AudioContext;
/** Reusable buffer filled with time-domain samples on every frame. */
data: Uint8Array<ArrayBuffer>;
/** Handle of the pending requestAnimationFrame callback, or null when none is scheduled. */
frame: number | null;
/** Source node wrapping the microphone MediaStream. */
source: MediaStreamAudioSourceNode;
}
/**
 * Cancels the timeout stored in `ref`, if any, and empties the slot.
 *
 * @param ref Mutable holder of a timeout handle; left as `null` afterwards.
 */
function clearTimer(ref: { current: ReturnType<typeof setTimeout> | null }) {
  const handle = ref.current;
  // Nothing scheduled: leave the ref untouched.
  if (handle === null) return;
  clearTimeout(handle);
  ref.current = null;
}
/**
 * Picks recorder options from the first MIME type in VOICE_MIME_CANDIDATES
 * that this runtime's MediaRecorder reports as supported.
 *
 * @returns `{ mimeType }` for the first supported candidate, or `undefined`
 *   when MediaRecorder is unavailable or no usable candidate is found.
 */
function mediaRecorderOptions(): MediaRecorderOptions | undefined {
  if (typeof MediaRecorder === "undefined") return undefined;
  for (const candidate of VOICE_MIME_CANDIDATES) {
    if (!MediaRecorder.isTypeSupported(candidate)) continue;
    // Only the first supported candidate is considered; an empty one yields no options.
    return candidate ? { mimeType: candidate } : undefined;
  }
  return undefined;
}
/**
 * Formats a duration as `m:ss` (minutes are not capped or padded).
 *
 * @param ms Elapsed time in milliseconds; negative values are shown as `0:00`.
 * @returns The formatted label, e.g. `1:05`.
 */
function formatVoiceElapsed(ms: number): string {
  const totalSeconds = Math.max(0, Math.floor(ms / 1000));
  const wholeMinutes = Math.floor(totalSeconds / 60);
  const leftoverSeconds = totalSeconds % 60;
  const paddedSeconds = leftoverSeconds.toString().padStart(2, "0");
  return wholeMinutes + ":" + paddedSeconds;
}
/**
 * Resolves the AudioContext constructor for the current runtime.
 *
 * @returns The standard `AudioContext`, the prefixed `webkitAudioContext` as a
 *   fallback, or `undefined` when there is no `window` or neither is present.
 */
function audioContextConstructor(): typeof AudioContext | undefined {
  if (typeof window === "undefined") return undefined;
  const standard = window.AudioContext;
  if (standard != null) return standard;
  const prefixed = window as unknown as { webkitAudioContext?: typeof AudioContext };
  return prefixed.webkitAudioContext;
}
/**
 * Converts unsigned 8-bit time-domain samples (centred on 128) into a level in [0, 1].
 *
 * The level is the RMS of the centred samples, boosted by a gain of 4.2 and a
 * 0.72 exponent, then clamped to 1.
 *
 * @param samples Sample values; an empty input yields 0.
 * @returns The level, 0 for a flat signal at 128.
 */
function voiceLevelFromSamples(samples: ArrayLike<number>): number {
  const count = samples.length;
  if (count === 0) return 0;
  let energy = 0;
  let position = 0;
  while (position < count) {
    const deviation = (samples[position] - 128) / 128;
    energy += deviation * deviation;
    position += 1;
  }
  const rootMeanSquare = Math.sqrt(energy / count);
  const boosted = Math.pow(rootMeanSquare * 4.2, 0.72);
  return Math.min(1, boosted);
}
/**
 * Maps a voice level to a waveform bar height.
 *
 * Levels below VOICE_MIN_LEVEL get the fixed silent height; the rest are scaled
 * linearly between the minimum and maximum bar heights and rounded.
 *
 * @param level Level as produced by the level meter.
 * @returns The bar height for that level.
 */
function waveformHeightFromLevel(level: number): number {
  if (level < VOICE_MIN_LEVEL) return VOICE_WAVEFORM_SILENT_HEIGHT;
  const headroom = 1 - VOICE_MIN_LEVEL;
  // Portion of the audible range covered by this level, capped at 1.
  const activeLevel = Math.min(1, (level - VOICE_MIN_LEVEL) / headroom);
  const heightRange = VOICE_WAVEFORM_MAX_HEIGHT - VOICE_WAVEFORM_MIN_HEIGHT;
  return Math.round(VOICE_WAVEFORM_MIN_HEIGHT + activeLevel * heightRange);
}
/**
 * Reads a Blob into a `data:` URL using FileReader.
 *
 * @param blob The binary content to encode.
 * @returns A promise for the data URL string. It rejects with the reader's error
 *   (or `read_failed`) when reading fails, and with `invalid_data_url` when the
 *   reader produces a non-string result.
 */
function blobToDataUrl(blob: Blob): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const reader = new FileReader();
    reader.onerror = () => {
      reject(reader.error ?? new Error("read_failed"));
    };
    reader.onload = () => {
      const { result } = reader;
      if (typeof result !== "string") {
        reject(new Error("invalid_data_url"));
        return;
      }
      resolve(result);
    };
    reader.readAsDataURL(blob);
  });
}
/**
 * Maps a transcription failure to the error key shown to the user.
 *
 * @param error The rejection value; only `Error` instances are inspected.
 * @returns `notConfigured` for a `not_configured` message, `tooLong` for
 *   `duration`, and `failed` for anything else.
 */
function transcriptionErrorKey(error: unknown): VoiceRecorderErrorKey {
  if (!(error instanceof Error)) return "failed";
  switch (error.message) {
    case "not_configured":
      return "notConfigured";
    case "duration":
      return "tooLong";
    default:
      return "failed";
  }
}

View File

@ -73,6 +73,7 @@
"models": "Models",
"providers": "Providers",
"image": "Image",
"voice": "Voice",
"browser": "Web",
"cliApps": "CLI Apps",
"mcp": "MCP",
@ -99,7 +100,8 @@
"capabilities": "Capabilities",
"apps": "Apps",
"nativeHost": "Native host",
"hostSafety": "App safety"
"hostSafety": "App safety",
"voiceInput": "Voice input"
},
"models": {
"selectModel": "Select model",
@ -161,7 +163,13 @@
"engine": "Engine",
"logs": "Logs",
"diagnostics": "Diagnostics",
"contextWindow": "Context window"
"contextWindow": "Context window",
"transcription": "Transcription",
"transcriptionProvider": "Provider",
"transcriptionProviderStatus": "Provider status",
"transcriptionModel": "Model",
"transcriptionLanguage": "Language",
"voiceLimits": "Limits"
},
"help": {
"theme": "Switch between light and dark appearance.",
@ -200,7 +208,12 @@
"diagnostics": "Export a small runtime report for support.",
"localServiceAccessNative": "Allow Full Access shell commands to reach services on this Mac.",
"webuiDefaultAccessNative": "Used by native chats without a project-specific permission.",
"contextWindow": "Choose the default context budget for this model configuration."
"contextWindow": "Choose the default context budget for this model configuration.",
"transcription": "Transcribe microphone input before sending it. Chat channel voice messages use the same settings.",
"transcriptionProvider": "Uses the matching provider credentials from Providers.",
"transcriptionProviderStatus": "API keys stay under providers, not in transcription settings.",
"transcriptionModel": "Leave as the resolved default unless your provider needs a custom model id.",
"transcriptionLanguage": "Optional ISO-639 hint such as en, zh, ja, or ko."
},
"timezone": {
"select": "Select timezone",
@ -391,6 +404,7 @@
"totalProviders": "{{count}} available",
"webSearch": "Web search",
"imageGeneration": "Image generation",
"voiceInput": "Voice input",
"workspace": "Workspace"
},
"usage": {
@ -486,6 +500,11 @@
"rawInstructions": "Raw SKILL.md",
"rawInstructionsEmpty": "No raw instructions.",
"detailDescription": "Details for {{name}}."
},
"voice": {
"selectProvider": "Select provider",
"configureProvider": "Configure provider",
"languageAuto": "Auto"
}
},
"chat": {
@ -678,6 +697,21 @@
"deepResearch": "Deep research",
"voice": "Voice input"
},
"voice": {
"hint": "Click to dictate or hold",
"stop": "Stop recording",
"transcribing": "Transcribing...",
"recordingStatus": "Recording {{time}}"
},
"voiceErrors": {
"unsupported": "Voice input is not supported in this browser.",
"permission": "Microphone permission is required.",
"notConfigured": "Configure a transcription provider first.",
"tooLong": "Recording is too long.",
"tooShort": "Hold a little longer to record voice.",
"noInput": "No microphone input detected.",
"failed": "Could not transcribe audio."
},
"slash": {
"ariaLabel": "Slash commands",
"label": "commands",

View File

@ -73,6 +73,7 @@
"models": "Modelos",
"providers": "Proveedores",
"image": "Imagen",
"voice": "Voz",
"browser": "Internet",
"runtime": "Sistema",
"advanced": "Seguridad",
@ -99,7 +100,8 @@
"mcp": "Servicios MCP",
"apps": "Aplicaciones",
"nativeHost": "Host nativo",
"hostSafety": "Seguridad de la app"
"hostSafety": "Seguridad de la app",
"voiceInput": "Entrada de voz"
},
"rows": {
"theme": "Tema",
@ -142,7 +144,13 @@
"engine": "Motor",
"logs": "Registros",
"diagnostics": "Diagnóstico",
"contextWindow": "Ventana de contexto"
"contextWindow": "Ventana de contexto",
"transcription": "Transcripción",
"transcriptionProvider": "Proveedor",
"transcriptionProviderStatus": "Estado del proveedor",
"transcriptionModel": "Modelo",
"transcriptionLanguage": "Idioma",
"voiceLimits": "Límites"
},
"help": {
"theme": "Cambia entre apariencia clara y oscura.",
@ -181,7 +189,12 @@
"diagnostics": "Exporta un pequeño informe de runtime para soporte.",
"localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.",
"webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.",
"contextWindow": "Elige el presupuesto de contexto predeterminado para esta configuración de modelo."
"contextWindow": "Elige el presupuesto de contexto predeterminado para esta configuración de modelo.",
"transcription": "Transcribe la entrada del micrófono antes de enviarla. Los mensajes de voz de los canales usan la misma configuración.",
"transcriptionProvider": "Usa las credenciales del proveedor correspondiente en Proveedores.",
"transcriptionProviderStatus": "Las claves API permanecen en proveedores, no en la configuración de transcripción.",
"transcriptionModel": "Déjalo como el valor predeterminado resuelto salvo que el proveedor necesite un id de modelo personalizado.",
"transcriptionLanguage": "Pista ISO-639 opcional, como en, zh, ja o ko."
},
"values": {
"light": "Claro",
@ -283,6 +296,7 @@
"totalProviders": "{{count}} disponibles",
"webSearch": "Búsqueda web",
"imageGeneration": "Generación de imágenes",
"voiceInput": "Entrada de voz",
"workspace": "Espacio de trabajo"
},
"usage": {
@ -486,6 +500,11 @@
"rawInstructions": "SKILL.md original",
"rawInstructionsEmpty": "No hay instrucciones originales.",
"detailDescription": "Detalles de {{name}}."
},
"voice": {
"selectProvider": "Seleccionar proveedor",
"configureProvider": "Configurar proveedor",
"languageAuto": "Auto"
}
},
"chat": {
@ -678,6 +697,21 @@
"deepResearch": "Investigación profunda",
"voice": "Entrada de voz"
},
"voice": {
"hint": "Haz clic para dictar o mantén",
"stop": "Detener grabación",
"transcribing": "Transcribiendo...",
"recordingStatus": "Grabando {{time}}"
},
"voiceErrors": {
"unsupported": "Este navegador no admite entrada de voz.",
"permission": "Se requiere permiso de micrófono.",
"notConfigured": "Configura primero un proveedor de transcripción.",
"tooLong": "La grabación es demasiado larga.",
"tooShort": "Mantén pulsado un poco más para grabar voz.",
"noInput": "No se detectó entrada del micrófono.",
"failed": "No se pudo transcribir el audio."
},
"slash": {
"ariaLabel": "Comandos slash",
"label": "comandos",

View File

@ -73,6 +73,7 @@
"models": "Modèles",
"providers": "Fournisseurs",
"image": "Images",
"voice": "Voix",
"browser": "Internet",
"runtime": "Système",
"advanced": "Sécurité",
@ -99,7 +100,8 @@
"mcp": "Services MCP",
"apps": "Applications",
"nativeHost": "Hôte natif",
"hostSafety": "Sécurité de l’app"
"hostSafety": "Sécurité de l’app",
"voiceInput": "Saisie vocale"
},
"rows": {
"theme": "Thème",
@ -142,7 +144,13 @@
"engine": "Moteur",
"logs": "Journaux",
"diagnostics": "Diagnostic",
"contextWindow": "Fenêtre de contexte"
"contextWindow": "Fenêtre de contexte",
"transcription": "Transcription",
"transcriptionProvider": "Fournisseur",
"transcriptionProviderStatus": "État du fournisseur",
"transcriptionModel": "Modèle",
"transcriptionLanguage": "Langue",
"voiceLimits": "Limites"
},
"help": {
"theme": "Basculer entre l’apparence claire et sombre.",
@ -181,7 +189,12 @@
"diagnostics": "Exporte un petit rapport d’exécution pour le support.",
"localServiceAccessNative": "Autorise les commandes shell Full Access à atteindre les services sur ce Mac.",
"webuiDefaultAccessNative": "Utilisé par les chats natifs sans permission propre au projet.",
"contextWindow": "Choisissez le budget de contexte par défaut pour cette configuration de modèle."
"contextWindow": "Choisissez le budget de contexte par défaut pour cette configuration de modèle.",
"transcription": "Transcrit l'entrée micro avant l'envoi. Les messages vocaux des canaux utilisent les mêmes réglages.",
"transcriptionProvider": "Utilise les identifiants du fournisseur correspondant dans Fournisseurs.",
"transcriptionProviderStatus": "Les clés API restent dans les fournisseurs, pas dans les réglages de transcription.",
"transcriptionModel": "Laissez le modèle résolu par défaut sauf si votre fournisseur exige un id personnalisé.",
"transcriptionLanguage": "Indice ISO-639 facultatif, comme en, zh, ja ou ko."
},
"values": {
"light": "Clair",
@ -283,6 +296,7 @@
"totalProviders": "{{count}} disponibles",
"webSearch": "Recherche web",
"imageGeneration": "Génération d’images",
"voiceInput": "Saisie vocale",
"workspace": "Espace de travail"
},
"usage": {
@ -486,6 +500,11 @@
"rawInstructions": "SKILL.md brut",
"rawInstructionsEmpty": "Aucune instruction brute.",
"detailDescription": "Détails de {{name}}."
},
"voice": {
"selectProvider": "Choisir un fournisseur",
"configureProvider": "Configurer le fournisseur",
"languageAuto": "Auto"
}
},
"chat": {
@ -678,6 +697,21 @@
"deepResearch": "Recherche approfondie",
"voice": "Entrée vocale"
},
"voice": {
"hint": "Cliquez pour dicter ou maintenez",
"stop": "Arrêter l'enregistrement",
"transcribing": "Transcription...",
"recordingStatus": "Enregistrement {{time}}"
},
"voiceErrors": {
"unsupported": "La saisie vocale n'est pas prise en charge par ce navigateur.",
"permission": "L'autorisation du microphone est requise.",
"notConfigured": "Configurez d'abord un fournisseur de transcription.",
"tooLong": "L'enregistrement est trop long.",
"tooShort": "Maintenez un peu plus longtemps pour enregistrer la voix.",
"noInput": "Aucune entrée microphone détectée.",
"failed": "Impossible de transcrire l'audio."
},
"slash": {
"ariaLabel": "Commandes slash",
"label": "commandes",

View File

@ -73,6 +73,7 @@
"models": "Model",
"providers": "Penyedia",
"image": "Gambar",
"voice": "Suara",
"browser": "Internet",
"runtime": "Sistem",
"advanced": "Keamanan",
@ -99,7 +100,8 @@
"mcp": "Layanan MCP",
"apps": "Aplikasi",
"nativeHost": "Host native",
"hostSafety": "Keamanan aplikasi"
"hostSafety": "Keamanan aplikasi",
"voiceInput": "Input suara"
},
"rows": {
"theme": "Tema",
@ -142,7 +144,13 @@
"engine": "Mesin",
"logs": "Log",
"diagnostics": "Diagnostik",
"contextWindow": "Jendela konteks"
"contextWindow": "Jendela konteks",
"transcription": "Transkripsi",
"transcriptionProvider": "Penyedia",
"transcriptionProviderStatus": "Status penyedia",
"transcriptionModel": "Model",
"transcriptionLanguage": "Bahasa",
"voiceLimits": "Batas"
},
"help": {
"theme": "Beralih antara tampilan terang dan gelap.",
@ -181,7 +189,12 @@
"diagnostics": "Ekspor laporan runtime kecil untuk dukungan.",
"localServiceAccessNative": "Izinkan perintah shell Full Access menjangkau layanan di Mac ini.",
"webuiDefaultAccessNative": "Digunakan oleh chat native tanpa izin khusus proyek.",
"contextWindow": "Pilih anggaran konteks default untuk konfigurasi model ini."
"contextWindow": "Pilih anggaran konteks default untuk konfigurasi model ini.",
"transcription": "Transkripsikan input mikrofon sebelum dikirim. Pesan suara channel memakai pengaturan yang sama.",
"transcriptionProvider": "Menggunakan kredensial penyedia yang sesuai dari Providers.",
"transcriptionProviderStatus": "API key tetap berada di providers, bukan di pengaturan transkripsi.",
"transcriptionModel": "Biarkan memakai default yang teresolusi kecuali penyedia membutuhkan id model khusus.",
"transcriptionLanguage": "Petunjuk ISO-639 opsional, seperti en, zh, ja, atau ko."
},
"values": {
"light": "Terang",
@ -283,6 +296,7 @@
"totalProviders": "{{count}} tersedia",
"webSearch": "Pencarian web",
"imageGeneration": "Pembuatan gambar",
"voiceInput": "Input suara",
"workspace": "Ruang kerja"
},
"usage": {
@ -486,6 +500,11 @@
"rawInstructions": "SKILL.md mentah",
"rawInstructionsEmpty": "Tidak ada instruksi mentah.",
"detailDescription": "Detail untuk {{name}}."
},
"voice": {
"selectProvider": "Pilih penyedia",
"configureProvider": "Konfigurasi penyedia",
"languageAuto": "Auto"
}
},
"chat": {
@ -678,6 +697,21 @@
"deepResearch": "Riset mendalam",
"voice": "Input suara"
},
"voice": {
"hint": "Klik untuk mendikte atau tahan",
"stop": "Hentikan rekaman",
"transcribing": "Mentranskripsi...",
"recordingStatus": "Merekam {{time}}"
},
"voiceErrors": {
"unsupported": "Input suara tidak didukung di browser ini.",
"permission": "Izin mikrofon diperlukan.",
"notConfigured": "Konfigurasikan penyedia transkripsi terlebih dahulu.",
"tooLong": "Rekaman terlalu panjang.",
"tooShort": "Tahan sedikit lebih lama untuk merekam suara.",
"noInput": "Tidak ada input mikrofon yang terdeteksi.",
"failed": "Tidak dapat mentranskripsi audio."
},
"slash": {
"ariaLabel": "Perintah slash",
"label": "perintah",

View File

@ -73,6 +73,7 @@
"models": "モデル",
"providers": "プロバイダー",
"image": "画像",
"voice": "音声",
"browser": "ウェブ",
"runtime": "システム",
"advanced": "セキュリティ",
@ -99,7 +100,8 @@
"mcp": "MCP サービス",
"apps": "アプリ",
"nativeHost": "ネイティブホスト",
"hostSafety": "アプリの安全性"
"hostSafety": "アプリの安全性",
"voiceInput": "音声入力"
},
"rows": {
"theme": "テーマ",
@ -142,7 +144,13 @@
"engine": "エンジン",
"logs": "ログ",
"diagnostics": "診断",
"contextWindow": "コンテキストウィンドウ"
"contextWindow": "コンテキストウィンドウ",
"transcription": "文字起こし",
"transcriptionProvider": "プロバイダー",
"transcriptionProviderStatus": "プロバイダー状態",
"transcriptionModel": "モデル",
"transcriptionLanguage": "言語",
"voiceLimits": "制限"
},
"help": {
"theme": "ライト表示とダーク表示を切り替えます。",
@ -181,7 +189,12 @@
"diagnostics": "サポート用の小さなランタイムレポートを書き出します。",
"localServiceAccessNative": "Full Access の shell コマンドがこの Mac 上のサービスにアクセスできるようにします。",
"webuiDefaultAccessNative": "プロジェクト固有の権限がないネイティブチャットで使用します。",
"contextWindow": "このモデル設定で使う既定のコンテキスト予算を選択します。"
"contextWindow": "このモデル設定で使う既定のコンテキスト予算を選択します。",
"transcription": "マイク入力を送信前に文字起こしします。チャネルの音声メッセージも同じ設定を使います。",
"transcriptionProvider": "プロバイダー設定にある対応する認証情報を使います。",
"transcriptionProviderStatus": "APIキーは文字起こし設定ではなくプロバイダー側に保存されます。",
"transcriptionModel": "プロバイダーがカスタムモデルIDを必要としない限り、解決済みのデフォルトのままにします。",
"transcriptionLanguage": "en、zh、ja、ko などの任意の ISO-639 ヒント。"
},
"values": {
"light": "ライト",
@ -283,6 +296,7 @@
"totalProviders": "{{count}} 個利用可能",
"webSearch": "Web 検索",
"imageGeneration": "画像生成",
"voiceInput": "音声入力",
"workspace": "ワークスペース"
},
"usage": {
@ -486,6 +500,11 @@
"rawInstructions": "元の SKILL.md",
"rawInstructionsEmpty": "元の説明はありません。",
"detailDescription": "{{name}} の詳細。"
},
"voice": {
"selectProvider": "プロバイダーを選択",
"configureProvider": "プロバイダーを設定",
"languageAuto": "自動"
}
},
"chat": {
@ -678,6 +697,21 @@
"deepResearch": "詳細調査",
"voice": "音声入力"
},
"voice": {
"hint": "クリックして音声入力、または長押し",
"stop": "録音を停止",
"transcribing": "文字起こし中...",
"recordingStatus": "録音中 {{time}}"
},
"voiceErrors": {
"unsupported": "このブラウザーは音声入力に対応していません。",
"permission": "マイクの許可が必要です。",
"notConfigured": "先に文字起こしプロバイダーを設定してください。",
"tooLong": "録音が長すぎます。",
"tooShort": "もう少し長く録音してください。",
"noInput": "マイク入力が検出されませんでした。",
"failed": "音声を文字起こしできませんでした。"
},
"slash": {
"ariaLabel": "スラッシュコマンド",
"label": "コマンド",

View File

@ -73,6 +73,7 @@
"models": "모델",
"providers": "제공자",
"image": "이미지",
"voice": "음성",
"browser": "웹",
"runtime": "시스템",
"advanced": "보안",
@ -99,7 +100,8 @@
"mcp": "MCP 서비스",
"apps": "앱",
"nativeHost": "네이티브 호스트",
"hostSafety": "앱 보안"
"hostSafety": "앱 보안",
"voiceInput": "음성 입력"
},
"rows": {
"theme": "테마",
@ -142,7 +144,13 @@
"engine": "엔진",
"logs": "로그",
"diagnostics": "진단",
"contextWindow": "컨텍스트 창"
"contextWindow": "컨텍스트 창",
"transcription": "전사",
"transcriptionProvider": "제공자",
"transcriptionProviderStatus": "제공자 상태",
"transcriptionModel": "모델",
"transcriptionLanguage": "언어",
"voiceLimits": "제한"
},
"help": {
"theme": "밝은 모드와 어두운 모드를 전환합니다.",
@ -181,7 +189,12 @@
"diagnostics": "지원용 작은 런타임 보고서를 내보냅니다.",
"localServiceAccessNative": "Full Access shell 명령이 이 Mac의 서비스에 접근할 수 있게 합니다.",
"webuiDefaultAccessNative": "프로젝트별 권한이 없는 네이티브 채팅에 사용됩니다.",
"contextWindow": "이 모델 구성의 기본 컨텍스트 예산을 선택합니다."
"contextWindow": "이 모델 구성의 기본 컨텍스트 예산을 선택합니다.",
"transcription": "마이크 입력을 보내기 전에 텍스트로 변환합니다. 채널 음성 메시지도 같은 설정을 사용합니다.",
"transcriptionProvider": "Providers에 저장된 해당 제공자의 인증 정보를 사용합니다.",
"transcriptionProviderStatus": "API 키는 transcription 설정이 아니라 providers 아래에 유지됩니다.",
"transcriptionModel": "제공자가 사용자 지정 모델 ID를 요구하지 않으면 해석된 기본값을 사용하세요.",
"transcriptionLanguage": "en, zh, ja, ko 같은 선택적 ISO-639 힌트입니다."
},
"values": {
"light": "라이트",
@ -283,6 +296,7 @@
"totalProviders": "{{count}}개 사용 가능",
"webSearch": "웹 검색",
"imageGeneration": "이미지 생성",
"voiceInput": "음성 입력",
"workspace": "작업공간"
},
"usage": {
@ -486,6 +500,11 @@
"rawInstructions": "원본 SKILL.md",
"rawInstructionsEmpty": "원본 지침이 없습니다.",
"detailDescription": "{{name}} 세부 정보."
},
"voice": {
"selectProvider": "제공자 선택",
"configureProvider": "제공자 설정",
"languageAuto": "자동"
}
},
"chat": {
@ -678,6 +697,21 @@
"deepResearch": "심층 조사",
"voice": "음성 입력"
},
"voice": {
"hint": "클릭해 받아쓰거나 길게 누르기",
"stop": "녹음 중지",
"transcribing": "변환 중...",
"recordingStatus": "녹음 중 {{time}}"
},
"voiceErrors": {
"unsupported": "이 브라우저는 음성 입력을 지원하지 않습니다.",
"permission": "마이크 권한이 필요합니다.",
"notConfigured": "먼저 음성 변환 제공업체를 설정하세요.",
"tooLong": "녹음 시간이 너무 깁니다.",
"tooShort": "음성을 녹음하려면 조금 더 길게 눌러 주세요.",
"noInput": "마이크 입력이 감지되지 않았습니다.",
"failed": "오디오를 변환하지 못했습니다."
},
"slash": {
"ariaLabel": "슬래시 명령",
"label": "명령",

View File

@ -73,6 +73,7 @@
"models": "Mô hình",
"providers": "Nhà cung cấp",
"image": "Hình ảnh",
"voice": "Giọng nói",
"browser": "Trang web",
"runtime": "Hệ thống",
"advanced": "Bảo mật",
@ -99,7 +100,8 @@
"mcp": "Dịch vụ MCP",
"apps": "Ứng dụng",
"nativeHost": "Host gốc",
"hostSafety": "An toàn ứng dụng"
"hostSafety": "An toàn ứng dụng",
"voiceInput": "Nhập bằng giọng nói"
},
"rows": {
"theme": "Chủ đề",
@ -142,7 +144,13 @@
"engine": "Bộ máy",
"logs": "Nhật ký",
"diagnostics": "Chẩn đoán",
"contextWindow": "Cửa sổ ngữ cảnh"
"contextWindow": "Cửa sổ ngữ cảnh",
"transcription": "Phiên âm",
"transcriptionProvider": "Nhà cung cấp",
"transcriptionProviderStatus": "Trạng thái nhà cung cấp",
"transcriptionModel": "Mô hình",
"transcriptionLanguage": "Ngôn ngữ",
"voiceLimits": "Giới hạn"
},
"help": {
"theme": "Chuyển giữa giao diện sáng và tối.",
@ -181,7 +189,12 @@
"diagnostics": "Xuất một báo cáo runtime nhỏ để hỗ trợ.",
"localServiceAccessNative": "Cho phép các lệnh shell Full Access truy cập các dịch vụ trên máy Mac này.",
"webuiDefaultAccessNative": "Dùng cho các cuộc trò chuyện gốc không có quyền riêng theo dự án.",
"contextWindow": "Chọn ngân sách ngữ cảnh mặc định cho cấu hình mô hình này."
"contextWindow": "Chọn ngân sách ngữ cảnh mặc định cho cấu hình mô hình này.",
"transcription": "Phiên âm đầu vào micrô trước khi gửi. Tin nhắn giọng nói từ kênh chat dùng cùng cài đặt.",
"transcriptionProvider": "Dùng thông tin xác thực của nhà cung cấp từ Providers.",
"transcriptionProviderStatus": "API key nằm trong providers, không nằm trong cài đặt transcription.",
"transcriptionModel": "Giữ mặc định đã resolve trừ khi nhà cung cấp cần id model tùy chỉnh.",
"transcriptionLanguage": "Gợi ý ISO-639 tùy chọn, như en, zh, ja hoặc ko."
},
"values": {
"light": "Sáng",
@ -283,6 +296,7 @@
"totalProviders": "{{count}} khả dụng",
"webSearch": "Tìm kiếm web",
"imageGeneration": "Tạo hình ảnh",
"voiceInput": "Nhập bằng giọng nói",
"workspace": "Không gian làm việc"
},
"usage": {
@ -486,6 +500,11 @@
"rawInstructions": "SKILL.md gốc",
"rawInstructionsEmpty": "Không có hướng dẫn gốc.",
"detailDescription": "Chi tiết cho {{name}}."
},
"voice": {
"selectProvider": "Chọn nhà cung cấp",
"configureProvider": "Cấu hình nhà cung cấp",
"languageAuto": "Tự động"
}
},
"chat": {
@ -678,6 +697,21 @@
"deepResearch": "Nghiên cứu sâu",
"voice": "Nhập bằng giọng nói"
},
"voice": {
"hint": "Bấm để đọc chính tả hoặc nhấn giữ",
"stop": "Dừng ghi âm",
"transcribing": "Đang chép lời...",
"recordingStatus": "Đang ghi {{time}}"
},
"voiceErrors": {
"unsupported": "Trình duyệt này không hỗ trợ nhập bằng giọng nói.",
"permission": "Cần quyền truy cập micrô.",
"notConfigured": "Hãy cấu hình nhà cung cấp chép lời trước.",
"tooLong": "Bản ghi âm quá dài.",
"tooShort": "Giữ lâu hơn một chút để ghi âm giọng nói.",
"noInput": "Không phát hiện đầu vào micrô.",
"failed": "Không thể chép lời âm thanh."
},
"slash": {
"ariaLabel": "Lệnh slash",
"label": "lệnh",

View File

@ -73,6 +73,7 @@
"models": "模型",
"providers": "提供商",
"image": "图片",
"voice": "语音",
"browser": "网页",
"cliApps": "CLI 应用",
"mcp": "MCP",
@ -99,7 +100,8 @@
"capabilities": "能力",
"apps": "应用",
"nativeHost": "原生宿主",
"hostSafety": "应用安全"
"hostSafety": "应用安全",
"voiceInput": "语音识别"
},
"models": {
"selectModel": "选择模型",
@ -161,7 +163,13 @@
"engine": "引擎",
"logs": "日志",
"diagnostics": "诊断",
"contextWindow": "上下文窗口"
"contextWindow": "上下文窗口",
"transcription": "语音转写",
"transcriptionProvider": "提供商",
"transcriptionProviderStatus": "提供商状态",
"transcriptionModel": "模型",
"transcriptionLanguage": "语言",
"voiceLimits": "限制"
},
"help": {
"theme": "在浅色和深色外观之间切换。",
@ -200,7 +208,12 @@
"diagnostics": "导出一份用于支持排查的小型运行报告。",
"localServiceAccessNative": "允许完全访问权限下的 shell 命令访问这台 Mac 上的服务。",
"webuiDefaultAccessNative": "用于没有单独项目权限的原生聊天。",
"contextWindow": "选择此模型配置的默认上下文预算。"
"contextWindow": "选择此模型配置的默认上下文预算。",
"transcription": "发送前先把麦克风输入转写到输入框。聊天渠道里的语音消息也使用同一套设置。",
"transcriptionProvider": "使用「提供商」中对应提供商的凭据。",
"transcriptionProviderStatus": "API Key 仍保存在 providers 里,不写进 transcription 设置。",
"transcriptionModel": "除非提供商需要自定义模型 ID,否则保持解析后的默认值即可。",
"transcriptionLanguage": "可选 ISO-639 语言提示,例如 en、zh、ja 或 ko。"
},
"timezone": {
"select": "选择时区",
@ -391,6 +404,7 @@
"totalProviders": "共 {{count}} 个可用",
"webSearch": "网页搜索",
"imageGeneration": "图片生成",
"voiceInput": "语音识别",
"workspace": "工作区"
},
"usage": {
@ -486,6 +500,11 @@
"rawInstructions": "原始 SKILL.md",
"rawInstructionsEmpty": "没有原始说明。",
"detailDescription": "{{name}} 的详情。"
},
"voice": {
"selectProvider": "选择提供商",
"configureProvider": "配置提供商",
"languageAuto": "自动"
}
},
"chat": {
@ -677,6 +696,21 @@
"deepResearch": "深度研究",
"voice": "语音输入"
},
"voice": {
"hint": "点击进行听写或长按",
"stop": "停止录音",
"transcribing": "正在转写...",
"recordingStatus": "正在录音 {{time}}"
},
"voiceErrors": {
"unsupported": "当前浏览器不支持语音输入。",
"permission": "需要麦克风权限。",
"notConfigured": "请先配置转写提供商。",
"tooLong": "录音时间太长。",
"tooShort": "请稍微多录一会儿。",
"noInput": "没有检测到麦克风输入。",
"failed": "语音转写失败。"
},
"slash": {
"ariaLabel": "斜杠命令",
"label": "命令",

View File

@ -73,6 +73,7 @@
"models": "模型",
"providers": "提供商",
"image": "圖片",
"voice": "語音",
"browser": "網頁",
"runtime": "系統",
"advanced": "安全",
@ -99,7 +100,8 @@
"mcp": "MCP 服務",
"apps": "應用",
"nativeHost": "原生宿主",
"hostSafety": "App 安全"
"hostSafety": "App 安全",
"voiceInput": "語音辨識"
},
"rows": {
"theme": "主題",
@ -142,7 +144,13 @@
"engine": "引擎",
"logs": "日誌",
"diagnostics": "診斷",
"contextWindow": "上下文視窗"
"contextWindow": "上下文視窗",
"transcription": "語音轉寫",
"transcriptionProvider": "提供商",
"transcriptionProviderStatus": "提供商狀態",
"transcriptionModel": "模型",
"transcriptionLanguage": "語言",
"voiceLimits": "限制"
},
"help": {
"theme": "在淺色與深色外觀之間切換。",
@ -181,7 +189,12 @@
"diagnostics": "匯出一份用於支援排查的小型執行報告。",
"localServiceAccessNative": "允許完全訪問權限下的 shell 命令訪問這台 Mac 上的服務。",
"webuiDefaultAccessNative": "用於沒有單獨專案權限的原生聊天。",
"contextWindow": "選擇此模型配置的預設上下文預算。"
"contextWindow": "選擇此模型配置的預設上下文預算。",
"transcription": "送出前先把麥克風輸入轉寫到輸入框。聊天渠道的語音訊息也使用同一組設定。",
"transcriptionProvider": "使用「提供商」中對應提供商的憑證。",
"transcriptionProviderStatus": "API Key 仍保存在 providers 裡,不寫進 transcription 設定。",
"transcriptionModel": "除非提供商需要自訂模型 ID,否則保持解析後的預設值即可。",
"transcriptionLanguage": "可選 ISO-639 語言提示,例如 en、zh、ja 或 ko。"
},
"values": {
"light": "淺色",
@ -283,6 +296,7 @@
"totalProviders": "共 {{count}} 個可用",
"webSearch": "網頁搜尋",
"imageGeneration": "圖片生成",
"voiceInput": "語音辨識",
"workspace": "工作區"
},
"usage": {
@ -486,6 +500,11 @@
"rawInstructions": "原始 SKILL.md",
"rawInstructionsEmpty": "沒有原始說明。",
"detailDescription": "{{name}} 的詳細資訊。"
},
"voice": {
"selectProvider": "選擇提供商",
"configureProvider": "設定提供商",
"languageAuto": "自動"
}
},
"chat": {
@ -678,6 +697,21 @@
"deepResearch": "深度研究",
"voice": "語音輸入"
},
"voice": {
"hint": "點擊進行聽寫或長按",
"stop": "停止錄音",
"transcribing": "正在轉寫...",
"recordingStatus": "正在錄音 {{time}}"
},
"voiceErrors": {
"unsupported": "目前瀏覽器不支援語音輸入。",
"permission": "需要麥克風權限。",
"notConfigured": "請先設定轉寫提供商。",
"tooLong": "錄音時間太長。",
"tooShort": "請稍微多錄一會兒。",
"noInput": "沒有偵測到麥克風輸入。",
"failed": "語音轉寫失敗。"
},
"slash": {
"ariaLabel": "斜線命令",
"label": "命令",

210
webui/src/lib/ansi.ts Normal file
View File

@ -0,0 +1,210 @@
/** One run of visible text together with the style that was active for it. */
export type AnsiSegment = {
// Text with the escape sequences already removed.
text: string;
// Style for `text`; left undefined when no styling is in effect.
style?: AnsiStyle;
};
/**
 * Resolved visual style for a segment.
 * NOTE(review): the keys look like CSS inline-style property names — confirm
 * how the renderer applies them.
 */
export type AnsiStyle = {
// CSS colour string for the background.
backgroundColor?: string;
// CSS colour string for the text.
color?: string;
// Only "italic" is representable; absence means normal.
fontStyle?: "italic";
// Numeric font weight.
fontWeight?: number;
// Opacity factor.
opacity?: number;
// Only "underline" is representable; absence means none.
textDecorationLine?: "underline";
};
/**
 * Mutable attribute state carried from one escape sequence to the next while
 * a string is being scanned.
 */
type AnsiState = {
// Explicit background colour; the key is absent when none is set.
backgroundColor?: string;
bold: boolean;
// Explicit foreground colour; the key is absent when none is set.
color?: string;
dim: boolean;
// When true, foreground and background are swapped at style-resolution time.
inverse: boolean;
italic: boolean;
underline: boolean;
};
// The escape character (code 27), built at runtime so no raw control byte lives in the source.
const ESC = String.fromCharCode(27);
// Matches one CSI sequence: ESC "[", then any characters in "0".."?",
// then any characters in " ".."/", then one final character in "@".."~".
// The regex is global and therefore stateful; users reset `lastIndex` before scanning.
const ANSI_PATTERN = new RegExp(`${ESC}\\[[0-?]*[ -/]*[@-~]`, "g");
// Base 8-colour palette, indexed 0-7 (black, red, green, yellow, blue, magenta, cyan, white).
const ANSI_COLORS = [
"#000000",
"#cd3131",
"#0dbc79",
"#e5e510",
"#2472c8",
"#bc3fbc",
"#11a8cd",
"#e5e5e5",
];
// Bright variants of the same 8 colours, in the same order.
const ANSI_BRIGHT_COLORS = [
"#666666",
"#f14c4c",
"#23d18b",
"#f5f543",
"#3b8eea",
"#d670d6",
"#29b8db",
"#ffffff",
];
// The six channel intensities used to build the 6x6x6 colour cube of the 256-colour palette.
const RGB_STEPS = [0, 95, 135, 175, 215, 255];
/**
 * Reports whether `value` contains at least one escape sequence matched by
 * `ANSI_PATTERN`.
 *
 * @param value Text to inspect.
 * @returns `true` when a sequence is present, otherwise `false`.
 */
export function hasAnsi(value: string): boolean {
  // The shared pattern is global, so `test` would resume from a stale
  // `lastIndex`; rewind it before probing.
  ANSI_PATTERN.lastIndex = 0;
  const found = ANSI_PATTERN.test(value);
  return found;
}
/**
 * Removes every escape sequence matched by `ANSI_PATTERN` from `value`.
 *
 * @param value Text that may contain escape sequences.
 * @returns The text with all matched sequences removed.
 */
export function stripAnsi(value: string): string {
  ANSI_PATTERN.lastIndex = 0;
  // The pattern never matches an empty string and has no capture groups, so
  // splitting on it and re-joining drops exactly the matched sequences.
  const visibleParts = value.split(ANSI_PATTERN);
  return visibleParts.join("");
}
/**
 * Builds a fresh attribute state with every flag switched off.
 *
 * The returned object carries no `color` / `backgroundColor` keys at all, so
 * code that merges it into an existing state (e.g. with `Object.assign`)
 * must clear those two keys separately.
 *
 * @returns A new state object; callers may mutate it freely.
 */
function initialState(): AnsiState {
  const cleared: AnsiState = {
    bold: false,
    dim: false,
    inverse: false,
    italic: false,
    underline: false,
  };
  return cleared;
}
/**
 * Converts a 256-colour palette index into a CSS colour string.
 *
 * Layout of the palette as implemented here:
 * - 0-7: `ANSI_COLORS`
 * - 8-15: `ANSI_BRIGHT_COLORS`
 * - 16-231: 6x6x6 colour cube built from `RGB_STEPS`
 * - 232-255: grayscale ramp starting at 8 in steps of 10
 *
 * @param value Palette index. At runtime this can be `undefined` or `NaN`
 *   when the escape sequence was truncated (e.g. `ESC[38;5m`).
 * @returns The colour, or `undefined` when `value` is not an integer in 0-255.
 */
function colorFrom256(value: number): string | undefined {
  // Comparisons against `undefined`/`NaN` are all false, so without an
  // explicit integer check a missing index would fall through to the
  // grayscale branch and produce "rgb(NaN, NaN, NaN)".
  if (!Number.isInteger(value) || value < 0 || value > 255) return undefined;
  if (value < 8) return ANSI_COLORS[value];
  if (value < 16) return ANSI_BRIGHT_COLORS[value - 8];
  if (value < 232) {
    const offset = value - 16;
    const red = RGB_STEPS[Math.floor(offset / 36)];
    const green = RGB_STEPS[Math.floor((offset % 36) / 6)];
    const blue = RGB_STEPS[offset % 6];
    return `rgb(${red}, ${green}, ${blue})`;
  }
  const gray = 8 + (value - 232) * 10;
  return `rgb(${gray}, ${gray}, ${gray})`;
}
/**
 * Formats three colour channels as a CSS `rgb(...)` string.
 *
 * @param red Red channel.
 * @param green Green channel.
 * @param blue Blue channel.
 * @returns `rgb(r, g, b)`, or `undefined` if any channel is non-finite
 *   (including a missing value) or outside 0-255.
 */
function colorFromRgb(red: number, green: number, blue: number): string | undefined {
  const isValidChannel = (channel: number): boolean =>
    Number.isFinite(channel) && channel >= 0 && channel <= 255;

  if (!isValidChannel(red) || !isValidChannel(green) || !isValidChannel(blue)) {
    return undefined;
  }
  return `rgb(${red}, ${green}, ${blue})`;
}
/**
 * Extracts the numeric SGR parameters from one matched CSI sequence.
 *
 * @param sequence A full escape sequence as matched by `ANSI_PATTERN`
 *   (ESC, "[", parameters, final character).
 * @returns The parameter list, `[0]` for an empty parameter body, or `null`
 *   when the sequence is not an SGR sequence and must be ignored.
 */
function normalizedSgrParams(sequence: string): number[] | null {
  if (!sequence.endsWith("m")) return null;
  const body = sequence.slice(2, -1).trim();
  if (!body) return [0];
  // A final "m" alone does not make a sequence SGR: private-prefixed forms
  // such as `ESC[>4;2m` or `ESC[?4m` also end in "m". Treating their
  // unparsable parts as 0 would silently reset the style, so accept only
  // digits and the ";" / ":" separators.
  if (!/^[0-9;:]+$/.test(body)) return null;
  return body.split(/[;:]/).map((part) => {
    // An omitted parameter (";;") defaults to 0.
    const value = Number.parseInt(part || "0", 10);
    return Number.isFinite(value) ? value : 0;
  });
}
/**
 * Handles the arguments that follow an extended-colour code (38 or 48).
 *
 * Mode 5 consumes one palette index; mode 2 consumes three RGB channels.
 * The arguments are consumed even when they do not yield a valid colour.
 *
 * @param state State to update in place.
 * @param params Full parameter list of the current sequence.
 * @param index Position of the 38/48 code within `params`.
 * @param key Which colour slot of `state` to write.
 * @returns The index of the last parameter consumed, or `index` unchanged
 *   when the mode is neither 5 nor 2.
 */
function applyExtendedColor(
  state: AnsiState,
  params: number[],
  index: number,
  key: "color" | "backgroundColor",
): number {
  // Positions past the end of `params` simply come back as undefined.
  const [mode, first, second, third] = params.slice(index + 1, index + 5);
  switch (mode) {
    case 5: {
      const indexed = colorFrom256(first);
      if (indexed) state[key] = indexed;
      return index + 2;
    }
    case 2: {
      const trueColor = colorFromRgb(first, second, third);
      if (trueColor) state[key] = trueColor;
      return index + 4;
    }
    default:
      return index;
  }
}
/**
 * Applies a list of SGR parameters to `state`, in order.
 *
 * Supported codes: 0 (reset), 1 bold, 2 dim, 3 italic, 4 underline,
 * 7 inverse, 22/23/24/27 (turn the matching attribute off), 30-37 / 90-97
 * foreground, 40-47 / 100-107 background, 39 / 49 default colours, and
 * 38 / 48 extended colours. Unknown codes are ignored.
 *
 * @param state State to mutate in place.
 * @param params Parameters of a single SGR sequence.
 */
function applySgrParams(state: AnsiState, params: number[]): void {
  for (let index = 0; index < params.length; index += 1) {
    const code = params[index];
    switch (code) {
      case 0:
        // `initialState()` has no colour keys, and `Object.assign` copies only
        // keys that exist on its source, so the colours must be removed
        // explicitly or they would survive the reset.
        delete state.color;
        delete state.backgroundColor;
        Object.assign(state, initialState());
        break;
      case 1:
        // Bold and dim are kept mutually exclusive.
        state.bold = true;
        state.dim = false;
        break;
      case 2:
        state.dim = true;
        state.bold = false;
        break;
      case 3:
        state.italic = true;
        break;
      case 4:
        state.underline = true;
        break;
      case 7:
        state.inverse = true;
        break;
      case 22:
        state.bold = false;
        state.dim = false;
        break;
      case 23:
        state.italic = false;
        break;
      case 24:
        state.underline = false;
        break;
      case 27:
        state.inverse = false;
        break;
      case 39:
        delete state.color;
        break;
      case 49:
        delete state.backgroundColor;
        break;
      case 38:
        // Extended colours consume following parameters; skip past them.
        index = applyExtendedColor(state, params, index, "color");
        break;
      case 48:
        index = applyExtendedColor(state, params, index, "backgroundColor");
        break;
      default:
        if (code >= 30 && code <= 37) {
          state.color = ANSI_COLORS[code - 30];
        } else if (code >= 40 && code <= 47) {
          state.backgroundColor = ANSI_COLORS[code - 40];
        } else if (code >= 90 && code <= 97) {
          state.color = ANSI_BRIGHT_COLORS[code - 90];
        } else if (code >= 100 && code <= 107) {
          state.backgroundColor = ANSI_BRIGHT_COLORS[code - 100];
        }
        break;
    }
  }
}
/**
 * Resolves the current attribute state into a concrete style object.
 *
 * With `inverse` set, the explicit foreground and background are swapped;
 * a colour that was never set stays unset on the other side.
 *
 * @param state Current attribute state (not modified).
 * @returns The style, or `undefined` when nothing needs styling.
 */
function styleFromState(state: AnsiState): AnsiStyle | undefined {
  const [foreground, background] = state.inverse
    ? [state.backgroundColor, state.color]
    : [state.color, state.backgroundColor];

  const style: AnsiStyle = {
    ...(foreground ? { color: foreground } : {}),
    ...(background ? { backgroundColor: background } : {}),
    ...(state.bold ? { fontWeight: 700 } : {}),
    ...(state.dim ? { opacity: 0.72 } : {}),
    ...(state.italic ? { fontStyle: "italic" as const } : {}),
    ...(state.underline ? { textDecorationLine: "underline" as const } : {}),
  };

  return Object.keys(style).length > 0 ? style : undefined;
}
/**
 * Splits `value` into styled text segments.
 *
 * Every escape sequence matched by `ANSI_PATTERN` is removed from the output.
 * SGR sequences update the running style; other sequences are dropped
 * without any effect on it.
 *
 * @param value Raw text that may contain escape sequences.
 * @returns Non-empty text runs in source order, each with the style that was
 *   active when it was emitted (`style` is `undefined` for unstyled text).
 */
export function parseAnsiSegments(value: string): AnsiSegment[] {
  const segments: AnsiSegment[] = [];
  const state = initialState();
  let cursor = 0;

  // Emits the text between `cursor` and `end` using the style in effect now.
  const emitUpTo = (end: number): void => {
    if (end <= cursor) return;
    segments.push({
      text: value.slice(cursor, end),
      style: styleFromState(state),
    });
  };

  // `matchAll` starts from the pattern's current `lastIndex`, so rewind first.
  ANSI_PATTERN.lastIndex = 0;
  for (const match of value.matchAll(ANSI_PATTERN)) {
    const sequence = match[0];
    const start = match.index ?? 0;
    emitUpTo(start);
    const params = normalizedSgrParams(sequence);
    if (params) applySgrParams(state, params);
    cursor = start + sequence.length;
  }
  emitUpTo(value.length);

  return segments;
}

View File

@ -16,6 +16,7 @@ import type {
SkillDetail,
SkillsPayload,
SlashCommand,
TranscriptionSettingsUpdate,
WebSearchSettingsUpdate,
WorkspacesPayload,
WebuiThreadPersistedPayload,
@ -547,3 +548,21 @@ export async function updateImageGenerationSettings(
token,
);
}
/**
 * Submits new transcription settings through the shared `request` helper.
 *
 * Every field of `update` is serialised into the query string of
 * `/api/settings/transcription/update`.
 *
 * @param token Auth token forwarded to `request`.
 * @param update Complete set of transcription settings to store.
 * @param base Optional URL prefix placed in front of the API path.
 * @returns Whatever `request` resolves with, typed as the settings payload.
 */
export async function updateTranscriptionSettings(
  token: string,
  update: TranscriptionSettingsUpdate,
  base: string = "",
): Promise<SettingsPayload> {
  const query = new URLSearchParams([
    ["enabled", String(update.enabled)],
    ["provider", update.provider],
    ["model", update.model],
    ["language", update.language],
    ["max_duration_sec", String(update.maxDurationSec)],
    ["max_upload_mb", String(update.maxUploadMb)],
  ]);
  const url = `${base}/api/settings/transcription/update?${query.toString()}`;
  return request<SettingsPayload>(url, token);
}

View File

@ -95,6 +95,12 @@ interface PendingNewChat {
timer: ReturnType<typeof setTimeout>;
}
/** Bookkeeping for one in-flight transcription request awaiting a server reply. */
interface PendingTranscription {
// Settles the caller's promise with the transcribed text.
resolve: (text: string) => void;
// Fails the caller's promise.
reject: (err: Error) => void;
// Timeout handle; must be cleared once the request settles.
timer: ReturnType<typeof setTimeout>;
}
export interface NanobotClientOptions {
url: string;
reconnect?: boolean;
@ -132,6 +138,7 @@ export class NanobotClient {
/** Latest ``goal_state`` snapshot per ``chat_id`` (multi-session isolation). */
private goalStateByChatId = new Map<string, GoalStateWsPayload>();
private pendingNewChat: PendingNewChat | null = null;
private pendingTranscriptions = new Map<string, PendingTranscription>();
// Frames queued while the socket is not yet OPEN
private sendQueue: Outbound[] = [];
private reconnectAttempts = 0;
@ -320,6 +327,27 @@ export class NanobotClient {
});
}
/**
 * Sends recorded audio for transcription and resolves with the recognised text.
 *
 * A fresh request id is registered in `pendingTranscriptions`, then a
 * `transcribe_audio` frame is handed to `queueSend`. The promise rejects with
 * "transcription timed out" if it is still pending after the timeout.
 *
 * @param dataUrl Recorded audio encoded as a data URL.
 * @param options.durationMs Recording length; forwarded as `duration_ms` only when provided.
 * @param options.timeoutMs Milliseconds to wait before giving up (default 120000).
 */
transcribeAudio(
  dataUrl: string,
  options?: { durationMs?: number; timeoutMs?: number },
): Promise<string> {
  const requestId = crypto.randomUUID();
  const timeoutMs = options?.timeoutMs ?? 120_000;
  const durationMs = options?.durationMs;
  return new Promise<string>((resolve, reject) => {
    const onTimeout = (): void => {
      this.pendingTranscriptions.delete(requestId);
      reject(new Error("transcription timed out"));
    };
    const timer = setTimeout(onTimeout, timeoutMs);
    this.pendingTranscriptions.set(requestId, { resolve, reject, timer });

    const frame = {
      type: "transcribe_audio" as const,
      request_id: requestId,
      data_url: dataUrl,
    };
    // Leave `duration_ms` off the wire entirely when the caller did not supply it.
    this.queueSend(durationMs === undefined ? frame : { ...frame, duration_ms: durationMs });
  });
}
attach(chatId: string): void {
this.knownChats.add(chatId);
if (this.socket?.readyState === WS_OPEN) {
@ -425,6 +453,16 @@ export class NanobotClient {
return;
}
if (parsed.event === "transcription_result") {
this.resolveTranscription(parsed.request_id, parsed.text);
return;
}
if (parsed.event === "transcription_error") {
this.rejectTranscription(parsed.request_id, parsed.detail || "error");
return;
}
if (parsed.event === "session_updated") {
this.emitSessionUpdate(parsed.chat_id, parsed.scope, parsed.workspace_scope);
return;
@ -500,6 +538,7 @@ export class NanobotClient {
this.pendingNewChat.reject(new Error("socket closed"));
this.pendingNewChat = null;
}
this.rejectAllTranscriptions("socket closed");
// Surface structured reasons *before* reconnect logic so the UI can
// display the error even while the client transparently reconnects.
// Browsers populate ``CloseEvent.code`` with the wire-level close code;
@ -528,6 +567,34 @@ export class NanobotClient {
}
}
/**
 * Settles the pending transcription identified by `requestId` with `text`.
 * Unknown ids (for example, requests that already timed out) are ignored.
 */
private resolveTranscription(requestId: string, text: string): void {
  const entry = this.pendingTranscriptions.get(requestId);
  if (entry === undefined) return;
  // Stop the timeout first so it cannot fire for an already-settled request.
  clearTimeout(entry.timer);
  this.pendingTranscriptions.delete(requestId);
  entry.resolve(text);
}
/**
 * Fails a pending transcription with `detail` as the error message.
 *
 * When no request id is given, the error cannot be attributed to a single
 * request, so every pending transcription is rejected. Unknown ids are ignored.
 */
private rejectTranscription(requestId: string | undefined, detail: string): void {
  if (requestId) {
    const entry = this.pendingTranscriptions.get(requestId);
    if (entry === undefined) return;
    clearTimeout(entry.timer);
    this.pendingTranscriptions.delete(requestId);
    entry.reject(new Error(detail));
    return;
  }
  this.rejectAllTranscriptions(detail);
}
/**
 * Rejects every pending transcription with `detail` and empties the registry.
 */
private rejectAllTranscriptions(detail: string): void {
  // Snapshot the entries, then clear the map in one step.
  const abandoned = Array.from(this.pendingTranscriptions.values());
  this.pendingTranscriptions.clear();
  for (const entry of abandoned) {
    clearTimeout(entry.timer);
    entry.reject(new Error(detail));
  }
}
private scheduleReconnect(): void {
this.setStatus("reconnecting");
const attempt = this.reconnectAttempts++;

View File

@ -391,6 +391,23 @@ export interface SettingsPayload {
default_api_base?: string | null;
}>;
};
transcription?: {
enabled: boolean;
provider: string;
provider_configured: boolean;
model: string;
language: string | null;
max_duration_sec: number;
max_upload_mb: number;
providers: Array<{
name: string;
label: string;
configured: boolean;
api_key_hint?: string | null;
api_base?: string | null;
default_api_base?: string | null;
}>;
};
runtime: {
config_path: string;
workspace_path: string;
@ -680,6 +697,15 @@ export interface ImageGenerationSettingsUpdate {
maxImagesPerTurn: number;
}
/** Full set of transcription settings submitted when the user saves changes. */
export interface TranscriptionSettingsUpdate {
// Whether transcription is turned on.
enabled: boolean;
// Name of the selected provider.
provider: string;
// Model identifier to use with the provider.
model: string;
// Language hint as a string.
// NOTE(review): an empty string presumably means "auto" — confirm with the backend.
language: string;
// Maximum recording length; presumably seconds, per the name.
maxDurationSec: number;
// Maximum upload size; presumably megabytes, per the name.
maxUploadMb: number;
}
export interface SlashCommand {
command: string;
title: string;
@ -782,6 +808,13 @@ export type InboundEvent =
scope?: "metadata" | "thread" | string;
workspace_scope?: WorkspaceScopePayload;
}
| { event: "transcription_result"; request_id: string; text: string }
| {
event: "transcription_error";
request_id?: string;
detail?: string;
provider?: string;
}
| { event: "error"; chat_id?: string; detail?: string; reason?: string };
/** Base64-encoded image attached to an outbound ``message`` envelope.
@ -845,6 +878,7 @@ export type Outbound =
| { type: "new_chat"; workspace_scope?: WorkspaceScopePayload }
| { type: "attach"; chat_id: string }
| { type: "set_workspace_scope"; chat_id: string; workspace_scope: WorkspaceScopePayload }
| { type: "transcribe_audio"; request_id: string; data_url: string; duration_ms?: number }
| {
type: "message";
chat_id: string;

View File

@ -1172,13 +1172,13 @@ describe("App layout", () => {
it("restores the settings section from the URL hash after a page reload", async () => {
mockFetchRoutes({ "/api/settings": baseSettingsPayload() });
window.history.replaceState(null, "", "/#/settings?section=models");
window.history.replaceState(null, "", "/#/settings?section=voice");
render(<App />);
await waitFor(() => expect(connectSpy).toHaveBeenCalled());
expect(await screen.findByRole("heading", { name: "Models" })).toBeInTheDocument();
expect(window.location.hash).toBe("#/settings?section=models");
expect(await screen.findByRole("heading", { name: "Voice input" })).toBeInTheDocument();
expect(window.location.hash).toBe("#/settings?section=voice");
});
it("updates the URL hash when switching settings sections", async () => {
@ -1197,6 +1197,11 @@ describe("App layout", () => {
expect(await screen.findByRole("heading", { name: "Models" })).toBeInTheDocument();
expect(window.location.hash).toBe("#/settings?section=models");
fireEvent.click(within(settingsNav).getByRole("button", { name: "Voice" }));
expect(await screen.findByRole("heading", { name: "Voice input" })).toBeInTheDocument();
expect(window.location.hash).toBe("#/settings?section=voice");
});
it("opens Apps from the main sidebar without replacing the sidebar", async () => {

View File

@ -1,4 +1,5 @@
import { act, render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { describe, expect, it, vi } from "vitest";
import { CodeBlock } from "@/components/CodeBlock";
@ -87,6 +88,64 @@ describe("CodeBlock", () => {
expect(screen.getByText("const value = 1;")).toBeInTheDocument();
});
// An explicit "ansi" block must use the ANSI renderer rather than the syntax
// highlighter, colour the text, and treat markup in the output as plain text.
it("renders ANSI output without mounting the syntax highlighter", () => {
render(
<ThemeProvider theme="dark">
<CodeBlock
language="ansi"
code={"\x1b[32mPASS\x1b[0m <script>alert(1)</script>"}
/>
</ThemeProvider>,
);
expect(screen.queryByTestId("highlighted-code")).not.toBeInTheDocument();
expect(screen.getByTestId("ansi-code")).toBeInTheDocument();
expect(screen.getByTestId("ansi-code").closest(".not-prose")).toBeTruthy();
expect(screen.getByText("ansi")).toBeInTheDocument();
// SGR 32 is green, the third entry of the base palette.
expect(screen.getByText("PASS")).toHaveStyle({ color: "#0dbc79" });
// The <script> text must be present as text and never become a real element.
// NOTE(review): nothing here asserts that the text after ESC[0m is unstyled.
expect(screen.getByText("<script>alert(1)</script>")).toBeInTheDocument();
expect(document.querySelector("script")).toBeNull();
});
// A block tagged with another language still switches to ANSI rendering when
// its content contains escape sequences; also covers 24-bit colour.
it("detects ANSI sequences in regular code blocks", () => {
render(
<ThemeProvider theme="light">
<CodeBlock
language="text"
code={"\x1b[38;2;35;209;139mtruecolor\x1b[0m"}
/>
</ThemeProvider>,
);
expect(screen.queryByTestId("highlighted-code")).not.toBeInTheDocument();
expect(screen.getByText("truecolor")).toHaveStyle({
color: "rgb(35, 209, 139)",
});
});
// The copy button must put the visible text on the clipboard, without escape sequences.
it("copies ANSI output as clean text", async () => {
const user = userEvent.setup();
const writeText = vi.fn().mockResolvedValue(undefined);
// Installed after userEvent.setup() so this stub is the clipboard the component sees.
Object.defineProperty(navigator, "clipboard", {
configurable: true,
value: { writeText },
});
try {
render(
<ThemeProvider theme="dark">
<CodeBlock language="ansi" code={"\x1b[32mPASS\x1b[0m"} />
</ThemeProvider>,
);
await user.click(screen.getByRole("button", { name: /copy/i }));
expect(writeText).toHaveBeenCalledWith("PASS");
} finally {
// Remove the stub even when an assertion fails so later tests are unaffected.
Reflect.deleteProperty(navigator, "clipboard");
}
});
it("reads theme from context without creating per-block observers", async () => {
const originalMutationObserver = globalThis.MutationObserver;
const observer = vi.fn();

View File

@ -412,6 +412,61 @@ describe("NanobotClient", () => {
);
});
// Happy path: the request frame carries the audio and a generated id, and the
// matching result settles the promise without reaching chat handlers.
it("sends transcription requests and resolves transcription results outside chat dispatch", async () => {
const client = new NanobotClient({
url: "ws://test",
reconnect: false,
socketFactory: (url) => new FakeSocket(url) as unknown as WebSocket,
});
// Registered only to prove that transcription events are not routed to chats.
const handler = vi.fn();
client.onChat("chat-a", handler);
client.connect();
lastSocket().fakeOpen();
const promise = client.transcribeAudio("data:audio/webm;base64,AAAA", {
durationMs: 1234,
timeoutMs: 1_000,
});
const frame = JSON.parse(lastSocket().sent.at(-1) as string);
expect(frame).toMatchObject({
type: "transcribe_audio",
data_url: "data:audio/webm;base64,AAAA",
duration_ms: 1234,
});
// The id is random, so only its type is checked; it is echoed back below.
expect(typeof frame.request_id).toBe("string");
lastSocket().fakeMessage({
event: "transcription_result",
request_id: frame.request_id,
text: "hello from voice",
});
await expect(promise).resolves.toBe("hello from voice");
expect(handler).not.toHaveBeenCalled();
});
// Failure paths: a server error for a specific request, and the socket
// closing while a request is still outstanding.
it("rejects pending transcription requests on server errors and socket close", async () => {
const client = new NanobotClient({
url: "ws://test",
reconnect: false,
socketFactory: (url) => new FakeSocket(url) as unknown as WebSocket,
});
client.connect();
lastSocket().fakeOpen();
const errored = client.transcribeAudio("data:audio/webm;base64,AAAA", { timeoutMs: 1_000 });
const errorFrame = JSON.parse(lastSocket().sent.at(-1) as string);
lastSocket().fakeMessage({
event: "transcription_error",
request_id: errorFrame.request_id,
detail: "not_configured",
});
// The server-provided detail becomes the error message.
await expect(errored).rejects.toThrow("not_configured");
const dropped = client.transcribeAudio("data:audio/webm;base64,BBBB", { timeoutMs: 1_000 });
lastSocket().close();
await expect(dropped).rejects.toThrow("socket closed");
});
it("queues sends while connecting and flushes on open", () => {
const client = new NanobotClient({
url: "ws://test",

View File

@ -1,4 +1,4 @@
import { fireEvent, render, screen, waitFor, within } from "@testing-library/react";
import { act, fireEvent, render, screen, waitFor, within } from "@testing-library/react";
import { afterEach, describe, expect, it, vi } from "vitest";
import { ThreadComposer } from "@/components/thread/ThreadComposer";
@ -121,6 +121,7 @@ const MCP_PRESETS: McpPresetInfo[] = [
},
];
const ORIGINAL_INNER_HEIGHT = window.innerHeight;
const ORIGINAL_MEDIA_DEVICES = navigator.mediaDevices;
function mockBlobUrls() {
Object.defineProperty(URL, "createObjectURL", {
@ -135,7 +136,16 @@ function mockBlobUrls() {
afterEach(() => {
vi.restoreAllMocks();
vi.unstubAllGlobals();
Reflect.deleteProperty(window, "nanobotHost");
if (ORIGINAL_MEDIA_DEVICES) {
Object.defineProperty(navigator, "mediaDevices", {
configurable: true,
value: ORIGINAL_MEDIA_DEVICES,
});
} else {
Reflect.deleteProperty(navigator, "mediaDevices");
}
window.localStorage.clear();
Object.defineProperty(window, "innerHeight", {
value: ORIGINAL_INNER_HEIGHT,
@ -161,6 +171,75 @@ function rect(init: Partial<DOMRect>): DOMRect {
};
}
/**
 * Installs fake `navigator.mediaDevices.getUserMedia` and `MediaRecorder`
 * implementations so recording can run without real audio hardware.
 *
 * @param blob Recorded data the fake recorder emits when stopped.
 * @returns The `getUserMedia` mock and the spy for the stream track's `stop`.
 */
function mockVoiceRecorder(blob = new Blob(["voice"], { type: "audio/webm" })) {
const stopTrack = vi.fn();
// The fake stream exposes a single track whose stop() call can be observed.
const getUserMedia = vi.fn(async () => ({
getTracks: () => [{ stop: stopTrack }],
}));
Object.defineProperty(navigator, "mediaDevices", {
configurable: true,
value: { getUserMedia },
});
class FakeMediaRecorder {
// Only "audio/webm" is reported as supported.
static isTypeSupported = vi.fn((type: string) => type === "audio/webm");
state: RecordingState = "inactive";
mimeType = blob.type;
ondataavailable: ((event: BlobEvent) => void) | null = null;
onstop: (() => void) | null = null;
start() {
this.state = "recording";
}
stop() {
this.state = "inactive";
// Deliver the whole recording as one chunk, then signal stop, synchronously.
this.ondataavailable?.({ data: blob } as BlobEvent);
this.onstop?.();
}
}
vi.stubGlobal("MediaRecorder", FakeMediaRecorder);
return { getUserMedia, stopTrack };
}
/**
 * Installs a fake `AudioContext` whose analyser reports a constant sample
 * value, and drives `requestAnimationFrame` from timers.
 *
 * @param sample Byte written to every slot of the time-domain buffer. The
 *   default 128 is the midpoint of the byte range, i.e. a flat signal.
 * @param state Value reported by the fake context's `state` property.
 */
function mockVoiceAudioInput(sample = 128, state: AudioContextState = "running") {
class FakeAudioContext {
state = state;
createMediaStreamSource() {
return { connect: vi.fn(), disconnect: vi.fn() };
}
createAnalyser() {
return {
fftSize: 256,
smoothingTimeConstant: 0,
disconnect: vi.fn(),
getByteTimeDomainData: (data: Uint8Array) => data.fill(sample),
};
}
close = vi.fn(async () => undefined);
// resume() resolves but never changes `state`, so a "suspended" context stays suspended.
resume = vi.fn(async () => undefined);
}
vi.stubGlobal("AudioContext", FakeAudioContext);
// Run animation frames on 16 ms timers so level polling advances with ordinary timers.
vi.spyOn(window, "requestAnimationFrame").mockImplementation((callback) =>
window.setTimeout(() => callback(performance.now()), 16) as unknown as number
);
vi.spyOn(window, "cancelAnimationFrame").mockImplementation((id) =>
window.clearTimeout(id as unknown as number)
);
}
/**
 * Lets 700 ms of real time pass inside `act` so a recording in progress
 * accumulates some duration before the test stops it.
 * NOTE(review): 700 ms is presumably above the composer's minimum recording
 * length — confirm against the recorder implementation.
 */
async function waitForVoiceCapture(): Promise<void> {
  const pause = (): Promise<void> =>
    new Promise<void>((done) => {
      setTimeout(done, 700);
    });
  await act(pause);
}
describe("ThreadComposer", () => {
it("renders a readonly hero model composer when provided", () => {
render(
@ -209,6 +288,245 @@ describe("ThreadComposer", () => {
expect(screen.queryByText(/Enter to send/)).not.toBeInTheDocument();
});
// Click-to-dictate happy path: the transcript lands in the input and nothing is sent.
it("transcribes voice input into the composer without sending", async () => {
mockVoiceRecorder();
const onSend = vi.fn();
const onTranscribeAudio = vi.fn(async () => "hello voice");
render(
<ThreadComposer
onSend={onSend}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await waitForVoiceCapture();
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
// The recording is handed over as a webm data URL together with its duration.
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledWith(
expect.stringMatching(/^data:audio\/webm;base64,/),
expect.objectContaining({ durationMs: expect.any(Number) }),
));
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("hello voice"));
expect(onSend).not.toHaveBeenCalled();
});
// A second click while the microphone request is still unresolved must not
// start a second capture.
it("does not start duplicate voice recordings while microphone access is pending", async () => {
const { getUserMedia, stopTrack } = mockVoiceRecorder();
// Keep getUserMedia pending until the test releases it by hand.
let resolveStream: ((stream: MediaStream) => void) | undefined;
getUserMedia.mockImplementation(() => new Promise((resolve) => {
resolveStream = resolve as (stream: MediaStream) => void;
}));
const onTranscribeAudio = vi.fn(async () => "one recording");
render(
<ThreadComposer
onSend={vi.fn()}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
const voiceButton = screen.getByRole("button", { name: "Voice input" });
fireEvent.click(voiceButton);
fireEvent.click(voiceButton);
expect(getUserMedia).toHaveBeenCalledTimes(1);
await act(async () => {
resolveStream?.({ getTracks: () => [{ stop: stopTrack }] } as unknown as MediaStream);
});
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await waitForVoiceCapture();
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledTimes(1));
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("one recording"));
});
// Holding the button starts recording; releasing it stops and transcribes.
it("supports press-and-hold voice recording", async () => {
mockVoiceRecorder();
const onSend = vi.fn();
const onTranscribeAudio = vi.fn(async () => "held voice");
render(
<ThreadComposer
onSend={onSend}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
const voiceButton = screen.getByRole("button", { name: "Voice input" });
fireEvent.pointerDown(voiceButton, { button: 0, pointerId: 1, pointerType: "touch" });
// Keep the pointer down for 180 ms.
// NOTE(review): presumably just past the hold threshold — confirm the constant.
await act(async () => {
await new Promise((resolve) => setTimeout(resolve, 180));
});
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await waitForVoiceCapture();
fireEvent.pointerUp(screen.getByRole("button", { name: "Stop recording" }), {
pointerId: 1,
pointerType: "touch",
});
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalled());
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("held voice"));
expect(onSend).not.toHaveBeenCalled();
});
// Ctrl+Shift+D held on the window records; releasing the key stops and transcribes.
it("supports keyboard hold voice recording", async () => {
mockVoiceRecorder();
const onSend = vi.fn();
const onTranscribeAudio = vi.fn(async () => "shortcut voice");
render(
<ThreadComposer
onSend={onSend}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
const voiceButton = screen.getByRole("button", { name: "Voice input" });
// The tooltip text and the advertised shortcut are part of the contract.
expect(voiceButton).toHaveAttribute("title", "Click to dictate or hold");
expect(voiceButton).toHaveAttribute("aria-keyshortcuts", "Control+Shift+D");
fireEvent.keyDown(window, { code: "KeyD", ctrlKey: true, key: "D", shiftKey: true });
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await waitForVoiceCapture();
fireEvent.keyUp(window, { code: "KeyD", ctrlKey: true, key: "D", shiftKey: true });
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalled());
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("shortcut voice"));
expect(onSend).not.toHaveBeenCalled();
});
// A click arriving shortly after a long press ends must not start a new recording.
it("ignores the delayed click emitted after a long-press voice recording", async () => {
const { getUserMedia } = mockVoiceRecorder();
const onTranscribeAudio = vi.fn(async () => "held once");
render(
<ThreadComposer
onSend={vi.fn()}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
const voiceButton = screen.getByRole("button", { name: "Voice input" });
fireEvent.pointerDown(voiceButton, { button: 0, pointerId: 1, pointerType: "touch" });
await act(async () => {
await new Promise((resolve) => setTimeout(resolve, 180));
});
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await waitForVoiceCapture();
fireEvent.pointerUp(screen.getByRole("button", { name: "Stop recording" }), {
pointerId: 1,
pointerType: "touch",
});
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("held once"));
// Simulate the trailing click that follows the pointer release.
await act(async () => {
await new Promise((resolve) => setTimeout(resolve, 20));
});
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
// Still exactly one capture and one transcription.
expect(getUserMedia).toHaveBeenCalledTimes(1);
expect(onTranscribeAudio).toHaveBeenCalledTimes(1);
});
// A failed transcription shows an error and leaves the user's draft untouched.
it("keeps existing text when voice transcription fails", async () => {
mockVoiceRecorder();
const onSend = vi.fn();
const onTranscribeAudio = vi.fn(async () => {
throw new Error("not_configured");
});
render(
<ThreadComposer
onSend={onSend}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
const input = screen.getByLabelText("Message input");
fireEvent.change(input, { target: { value: "draft" } });
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
await waitForVoiceCapture();
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
// The "not_configured" error is shown as a user-facing message.
await waitFor(() => {
expect(screen.getByText("Configure a transcription provider first.")).toBeInTheDocument();
});
expect(input).toHaveValue("draft");
expect(onSend).not.toHaveBeenCalled();
});
// Stopping right after starting yields a "too short" hint and no transcription call.
it("does not transcribe recordings that are too short", async () => {
mockVoiceRecorder();
const onTranscribeAudio = vi.fn(async () => "should not appear");
render(
<ThreadComposer
onSend={vi.fn()}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
// Deliberately no waitForVoiceCapture(): the recording is stopped straight away.
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
await waitFor(() => {
expect(screen.getByText("Hold a little longer to record voice.")).toBeInTheDocument();
});
expect(onTranscribeAudio).not.toHaveBeenCalled();
});
// A running audio context that only ever reports a flat signal triggers the
// silence warning, and the recording is not transcribed.
it("warns during recording when microphone input is silent", async () => {
mockVoiceRecorder();
// Defaults: constant sample of 128 with the context in the "running" state.
mockVoiceAudioInput();
const onTranscribeAudio = vi.fn(async () => "should not appear");
render(
<ThreadComposer
onSend={vi.fn()}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
// Wait 1150 ms.
// NOTE(review): presumably past the silence-detection window — confirm the constant.
await act(async () => {
await new Promise((resolve) => setTimeout(resolve, 1_150));
});
expect(screen.getByText("No microphone input detected.")).toBeInTheDocument();
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
expect(onTranscribeAudio).not.toHaveBeenCalled();
});
// When the audio context is suspended, level readings are not meaningful: no
// silence warning may appear and the recording is transcribed as usual.
it("does not treat unavailable microphone levels as silence", async () => {
mockVoiceRecorder();
// Same flat signal as the silence test, but the context reports "suspended".
mockVoiceAudioInput(128, "suspended");
const onTranscribeAudio = vi.fn(async () => "voice text");
render(
<ThreadComposer
onSend={vi.fn()}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await act(async () => {
await new Promise((resolve) => setTimeout(resolve, 1_150));
});
expect(screen.queryByText("No microphone input detected.")).not.toBeInTheDocument();
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledTimes(1));
expect(screen.getByDisplayValue("voice text")).toBeInTheDocument();
});
it("renders and changes workspace access mode", async () => {
const onWorkspaceScopeChange = vi.fn();
render(