feat(transcription): add shared voice input support (#4232)

* feat(webui): add voice transcription input

* feat(webui): render ANSI output in code blocks

* refactor(webui): isolate voice recorder logic

* refactor(transcription): keep websocket ingress thin

* refactor(transcription): resolve channel audio settings on demand

* style(webui): neutralize voice waveform color

* feat(webui): add voice input tooltip

* feat(webui): add voice input keyboard shortcut

* fix(webui): distinguish voice shortcut platforms

* fix(webui): place voice button after model selector

* refactor(webui): share voice hold recording helpers

* fix(desktop): allow microphone voice input

* fix(webui): stabilize token usage month labels

* feat(webui): show voice input on settings overview

* fix(webui): label voice capability as recognition

* fix(webui): align capability overview status

* refactor(webui): isolate transcription socket handling

* fix(webui): soften silent voice waveform

* refactor(audio): clarify transcription service location

* docs(transcription): clarify audio and provider boundaries

* fix(exec): reduce session output polling flake
This commit is contained in:
Xubin Ren 2026-06-09 01:08:49 +08:00 committed by GitHub
parent 06d454a225
commit 9c81280300
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
49 changed files with 3071 additions and 257 deletions

View File

@ -47,6 +47,9 @@
], ],
"mac": { "mac": {
"category": "public.app-category.developer-tools", "category": "public.app-category.developer-tools",
"extendInfo": {
"NSMicrophoneUsageDescription": "nanobot uses the microphone to transcribe voice input before you send messages."
},
"target": [ "target": [
"dmg" "dmg"
] ]

View File

@ -15,6 +15,7 @@ import {
protocol, protocol,
session, session,
shell, shell,
systemPreferences,
} from "electron"; } from "electron";
import type { IpcMainInvokeEvent, WebContents } from "electron"; import type { IpcMainInvokeEvent, WebContents } from "electron";
@ -100,6 +101,58 @@ function isTrustedAppUrl(rawUrl: string): boolean {
} }
} }
/**
 * Decides whether a permission request comes from the app's own content.
 *
 * The request is trusted when at least one of the request's `requestingUrl`,
 * its `securityOrigin`, or the current URL of `webContents` is a string that
 * `isTrustedAppUrl` accepts.
 *
 * NOTE(review): this is an any-of check, so a trusted top-level URL is enough
 * even if `requestingUrl` names a different origin — confirm that is intended.
 *
 * @param webContents - Contents that issued the request, or `null` when none is available.
 * @param details - Opaque details object supplied with the request.
 * @returns `true` when any candidate URL is trusted.
 */
function isTrustedPermissionRequest(
  webContents: WebContents | null,
  details: unknown,
): boolean {
  const candidates: unknown[] = [
    permissionDetail(details, "requestingUrl"),
    permissionDetail(details, "securityOrigin"),
    webContents?.getURL(),
  ];
  for (const candidate of candidates) {
    if (typeof candidate === "string" && isTrustedAppUrl(candidate)) {
      return true;
    }
  }
  return false;
}
/**
 * Reads one property from an untyped permission-details value.
 *
 * @param details - Value of unknown shape.
 * @param key - Property name to look up.
 * @returns The property value when `details` is a non-null object, otherwise `undefined`.
 */
function permissionDetail(details: unknown, key: string): unknown {
  if (details === null || typeof details !== "object") {
    return undefined;
  }
  const fields = details as Record<string, unknown>;
  return fields[key];
}
/**
 * Reports whether a media permission request asks for audio and nothing else.
 *
 * When `details.mediaTypes` is an array it must contain `"audio"` and must not
 * contain `"video"`. Otherwise the singular `details.mediaType` must equal
 * `"audio"`.
 *
 * @param details - Opaque details object supplied with the request.
 * @returns `true` for audio-only requests.
 */
function isAudioOnlyMediaRequest(details: unknown): boolean {
  const requestedTypes = permissionDetail(details, "mediaTypes");
  if (!Array.isArray(requestedTypes)) {
    return permissionDetail(details, "mediaType") === "audio";
  }
  const wantsAudio = requestedTypes.includes("audio");
  const wantsVideo = requestedTypes.includes("video");
  return wantsAudio && !wantsVideo;
}
/**
 * Resolves whether the operating system allows microphone capture.
 *
 * On platforms other than `darwin` this resolves `true` without consulting
 * `systemPreferences`. On `darwin` the current microphone access status decides:
 * `"granted"` resolves `true`, `"denied"` and `"restricted"` resolve `false`,
 * and any other status triggers `askForMediaAccess("microphone")`.
 *
 * @returns `true` when microphone access is available or was just granted.
 */
async function requestNativeMicrophoneAccess(): Promise<boolean> {
  if (process.platform !== "darwin") {
    return true;
  }
  switch (systemPreferences.getMediaAccessStatus("microphone")) {
    case "granted":
      return true;
    case "denied":
    case "restricted":
      return false;
    default:
      return await systemPreferences.askForMediaAccess("microphone");
  }
}
/**
 * Installs the permission check and request handlers on the default session.
 *
 * Both handlers accept only `"media"` permissions that come from a trusted
 * request and are audio-only; everything else is refused. An accepted request
 * is additionally gated on `requestNativeMicrophoneAccess`, and a rejection of
 * that promise is reported to the callback as a denial.
 */
function registerPermissionHandlers(): void {
  // Single predicate shared by both handlers so they cannot drift apart.
  const isAllowedMicrophoneRequest = (
    webContents: WebContents | null,
    permission: string,
    details: unknown,
  ): boolean =>
    permission === "media"
    && isTrustedPermissionRequest(webContents, details)
    && isAudioOnlyMediaRequest(details);

  session.defaultSession.setPermissionCheckHandler(
    (webContents, permission, _origin, details) =>
      isAllowedMicrophoneRequest(webContents, permission, details),
  );

  session.defaultSession.setPermissionRequestHandler(
    (webContents, permission, callback, details) => {
      if (!isAllowedMicrophoneRequest(webContents, permission, details)) {
        callback(false);
        return;
      }
      void requestNativeMicrophoneAccess().then(callback, () => callback(false));
    },
  );
}
function assertTrustedIpc(event: IpcMainInvokeEvent): void { function assertTrustedIpc(event: IpcMainInvokeEvent): void {
const frameUrl = event.senderFrame?.url || event.sender.getURL(); const frameUrl = event.senderFrame?.url || event.sender.getURL();
if (!isTrustedAppUrl(frameUrl)) { if (!isTrustedAppUrl(frameUrl)) {
@ -749,6 +802,7 @@ app.whenReady().then(async () => {
} }
registerIpcHandlers(); registerIpcHandlers();
registerPermissionHandlers();
registerAppProtocol(webDist, devUrl); registerAppProtocol(webDist, devUrl);
mainWindow = createWindow(); mainWindow = createWindow();

View File

@ -234,7 +234,7 @@ nanobot channels login <channel_name> --force # re-authenticate
| `_handle_message(sender_id, chat_id, content, media?, metadata?, session_key?)` | **Call this when you receive a message.** Checks `is_allowed()`, then publishes to the bus. Automatically sets `_wants_stream` if `supports_streaming` is true. | | `_handle_message(sender_id, chat_id, content, media?, metadata?, session_key?)` | **Call this when you receive a message.** Checks `is_allowed()`, then publishes to the bus. Automatically sets `_wants_stream` if `supports_streaming` is true. |
| `is_allowed(sender_id)` | Checks against `config.allow_from`; `"*"` allows all, `[]` denies all. | | `is_allowed(sender_id)` | Checks against `config.allow_from`; `"*"` allows all, `[]` denies all. |
| `default_config()` (classmethod) | Returns default config dict for `nanobot onboard`. Override to declare your fields. | | `default_config()` (classmethod) | Returns default config dict for `nanobot onboard`. Override to declare your fields. |
| `transcribe_audio(file_path)` | Transcribes audio via Groq Whisper (if configured). | | `transcribe_audio(file_path)` | Transcribes audio via the shared top-level `transcription` config (if configured). |
| `supports_streaming` (property) | `True` when config has `"streaming": true` **and** subclass overrides `send_delta()`. | | `supports_streaming` (property) | `True` when config has `"streaming": true` **and** subclass overrides `send_delta()`. |
| `is_running` | Returns `self._running`. | | `is_running` | Returns `self._running`. |
| `login(force=False)` | Perform interactive login (e.g. QR code scan). Returns `True` if already authenticated or login succeeds. Override in subclasses that support interactive login. | | `login(force=False)` | Perform interactive login (e.g. QR code scan). Returns `True` if already authenticated or login succeeds. Override in subclasses that support interactive login. |

View File

@ -119,7 +119,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent
## Providers ## Providers
> [!TIP] > [!TIP]
> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead, and optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) for more accurate transcription. The API key is picked from the matching provider config. > - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` to use OpenAI Whisper. API keys still live in the matching `providers.<provider>` config.
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link) > - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config. > - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`. > - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
@ -1100,6 +1100,61 @@ Set `agents.defaults.modelPreset` to start with a named preset:
When `modelPreset` is `null` or omitted, startup uses the implicit `default` preset from `agents.defaults.*`. Runtime changes made with `/model <preset>` are not written back to `config.json`; they affect future turns until the process restarts or another model/config change replaces them. When `modelPreset` is `null` or omitted, startup uses the implicit `default` preset from `agents.defaults.*`. Runtime changes made with `/model <preset>` are not written back to `config.json`; they affect future turns until the process restarts or another model/config change replaces them.
## Transcription Settings
Audio transcription is a shared capability used by chat-channel voice messages and by WebUI/desktop microphone input. Chat-channel voice messages are transcribed automatically before they enter the agent. WebUI and desktop microphone input is transcribed into the composer first, so you can edit the text before sending.
Configure transcription under the top-level `transcription` section:
```json
{
"transcription": {
"enabled": true,
"provider": "groq",
"model": null,
"language": null,
"maxDurationSec": 120,
"maxUploadMb": 25
}
}
```
| Setting | Default | Description |
|---------|---------|-------------|
| `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. |
| `provider` | `"groq"` | Transcription backend: `"groq"` or `"openai"`. |
| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq and `whisper-1` for OpenAI. |
| `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. |
| `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration, in seconds (allowed range: 1–600). |
| `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size, in megabytes (allowed range: 1–100). |
Provider and language resolution is intentionally ordered for backwards compatibility:
1. `transcription.provider` / `transcription.language`
2. Legacy `channels.transcriptionProvider` / `channels.transcriptionLanguage`
3. Built-in defaults (`provider: "groq"`, no language hint)
The legacy `channels.*` transcription fields existed before transcription became a shared capability across chat channels and WebUI/desktop microphone input. They are still read so older `config.json` files keep working, but they are no longer the preferred configuration surface. If both old and new fields are present, the top-level `transcription` values are the source of truth.
Transcription credentials are intentionally not stored in `transcription`. Put the API key and optional endpoint in the matching provider config:
```json
{
"providers": {
"groq": {
"apiKey": "gsk-...",
"apiBase": "https://api.groq.com/openai/v1"
}
},
"transcription": {
"provider": "groq",
"language": "zh"
}
}
```
Selecting a transcription provider does not configure credentials by itself. For example, the effective provider may default to Groq for compatibility, but transcription is only usable when `providers.groq.apiKey` or the matching environment-backed config is available. The Settings UI writes only the top-level `transcription` fields.
## Channel Settings ## Channel Settings
Global settings that apply to all channels. Configure under the `channels` section in `~/.nanobot/config.json`: Global settings that apply to all channels. Configure under the `channels` section in `~/.nanobot/config.json`:
@ -1111,8 +1166,6 @@ Global settings that apply to all channels. Configure under the `channels` secti
"sendToolHints": false, "sendToolHints": false,
"extractDocumentText": true, "extractDocumentText": true,
"sendMaxRetries": 3, "sendMaxRetries": 3,
"transcriptionProvider": "groq",
"transcriptionLanguage": null,
"telegram": { ... } "telegram": { ... }
} }
} }
@ -1125,8 +1178,8 @@ Global settings that apply to all channels. Configure under the `channels` secti
| `showReasoning` | `true` | Allow channels to surface model reasoning/thinking content (DeepSeek-R1 `reasoning_content`, Anthropic `thinking_blocks`, inline `<think>` tags). Reasoning flows as a dedicated stream with `_reasoning_delta` / `_reasoning_end` markers — channels override `send_reasoning_delta` / `send_reasoning_end` to render in-place updates. Even with `true`, channels without those overrides stay no-op silently. Currently surfaced on CLI and WebSocket/WebUI (italic shimmer header, auto-collapses after the stream ends); Telegram / Slack / Discord / Feishu / WeChat / Matrix keep the base no-op until their bubble UI is adapted. Independent of `sendProgress`. | | `showReasoning` | `true` | Allow channels to surface model reasoning/thinking content (DeepSeek-R1 `reasoning_content`, Anthropic `thinking_blocks`, inline `<think>` tags). Reasoning flows as a dedicated stream with `_reasoning_delta` / `_reasoning_end` markers — channels override `send_reasoning_delta` / `send_reasoning_end` to render in-place updates. Even with `true`, channels without those overrides stay no-op silently. Currently surfaced on CLI and WebSocket/WebUI (italic shimmer header, auto-collapses after the stream ends); Telegram / Slack / Discord / Feishu / WeChat / Matrix keep the base no-op until their bubble UI is adapted. Independent of `sendProgress`. |
| `extractDocumentText` | `true` | Extract supported document/text attachments into the model prompt. Set to `false` to keep document content out of the prompt and include attachment path references instead. | | `extractDocumentText` | `true` | Extract supported document/text attachments into the model prompt. Set to `false` to keep document content out of the prompt and include attachment path references instead. |
| `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) | | `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key and optional `apiBase` are auto-resolved from the matching provider config. Chat-style bases such as `https://api.groq.com/openai/v1` are normalized to the audio transcription endpoint. |
| `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. | `channels.transcriptionProvider` and `channels.transcriptionLanguage` are deprecated compatibility fields. They remain as a read-only fallback for older configs, but new configuration should use top-level `transcription.provider` and `transcription.language`.
`sendProgress` and `sendToolHints` can also be overridden per channel. The `sendProgress` and `sendToolHints` can also be overridden per channel. The
global values stay as defaults for channels that do not set their own value: global values stay as defaults for channels that do not set their own value:

View File

@ -24,6 +24,7 @@ DEFAULT_WAIT_FOR_MS = 10_000
MAX_WAIT_FOR_MS = 120_000 MAX_WAIT_FOR_MS = 120_000
DEFAULT_MAX_OUTPUT_CHARS = 10_000 DEFAULT_MAX_OUTPUT_CHARS = 10_000
MAX_OUTPUT_CHARS = 50_000 MAX_OUTPUT_CHARS = 50_000
OUTPUT_DRAIN_GRACE_S = 0.1
@dataclass(slots=True) @dataclass(slots=True)
@ -139,6 +140,8 @@ class _ExecSession:
asyncio.gather(self._stdout_task, self._stderr_task), asyncio.gather(self._stdout_task, self._stderr_task),
timeout=2.0, timeout=2.0,
) )
elif yield_time_ms > 0:
await self._wait_for_buffered_output()
async with self._lock: async with self._lock:
output = "".join(self._chunks) output = "".join(self._chunks)
@ -163,6 +166,14 @@ class _ExecSession:
with suppress(asyncio.TimeoutError): with suppress(asyncio.TimeoutError):
await asyncio.wait_for(self.process.wait(), timeout=5.0) await asyncio.wait_for(self.process.wait(), timeout=5.0)
async def _wait_for_buffered_output(self) -> None:
    """Briefly wait for the first buffered output chunk to appear.

    Polls ``self._chunks`` under ``self._lock`` every 10 ms and returns as
    soon as it is non-empty, or once ``OUTPUT_DRAIN_GRACE_S`` seconds have
    elapsed, whichever comes first.
    """
    give_up_at = time.monotonic() + OUTPUT_DRAIN_GRACE_S
    while True:
        if time.monotonic() >= give_up_at:
            return
        async with self._lock:
            has_output = bool(self._chunks)
        if has_output:
            return
        await asyncio.sleep(0.01)
class ExecSessionManager: class ExecSessionManager:
def __init__(self, *, max_sessions: int = 8, idle_timeout: int = 1800) -> None: def __init__(self, *, max_sessions: int = 8, idle_timeout: int = 1800) -> None:

View File

@ -0,0 +1,2 @@
"""Shared audio service helpers."""

View File

@ -0,0 +1,183 @@
"""Application-level audio transcription service.
This module owns nanobot's transcription behavior: config resolution,
legacy channel fallback, upload validation, temporary-file handling, and
dispatch to provider adapters. It deliberately does not know provider-specific
HTTP details; those live in ``nanobot.providers.transcription``.
"""
from __future__ import annotations
from contextlib import suppress
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal
from loguru import logger
from nanobot.config.paths import get_media_dir
from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url
# Names of the transcription backends this module can dispatch to.
TranscriptionProviderName = Literal["groq", "openai"]
# Provider used when neither the top-level ``transcription`` section nor the
# legacy ``channels`` setting names a recognised provider.
_DEFAULT_PROVIDER: TranscriptionProviderName = "groq"
# Model used for each provider when no model override is configured. The keys
# also serve as the set of recognised provider names in ``_as_provider``.
_DEFAULT_MODELS: dict[TranscriptionProviderName, str] = {
"groq": "whisper-large-v3",
"openai": "whisper-1",
}
# Upload size cap (25 MiB) applied when the resolved ``max_upload_mb`` is falsy.
_MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024
# Data-URL MIME types accepted by ``transcribe_audio_data_url``; any other
# type is rejected with the "mime" error.
_AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({
"audio/aac",
"audio/flac",
"audio/m4a",
"audio/mp4",
"audio/mpeg",
"audio/ogg",
"audio/wav",
"audio/webm",
"audio/x-m4a",
"audio/x-wav",
})
@dataclass(frozen=True)
class EffectiveTranscriptionConfig:
    """Immutable, fully-resolved transcription settings.

    Instances are produced by ``resolve_transcription_config`` and passed to
    the transcription entry points in this module.
    """

    enabled: bool
    provider: TranscriptionProviderName
    model: str
    language: str | None
    # Left out of the generated repr so the key is not printed with the config.
    api_key: str = field(repr=False)
    api_base: str
    max_duration_sec: int
    max_upload_mb: int

    @property
    def configured(self) -> bool:
        """Return True when a non-empty API key is present."""
        has_key = bool(self.api_key)
        return has_key
class TranscriptionIngressError(Exception):
    """Upload/validation failure carrying a stable code for WebUI clients.

    Attributes are documented inline: ``detail`` is the short error code that
    is also used as the exception message, and ``extra`` holds any additional
    keyword context supplied by the raiser.
    """

    def __init__(self, detail: str, **extra: Any):
        super().__init__(detail)
        # Keyword context, e.g. ``provider=...`` for the "not_configured" code.
        self.extra = extra
        # Short error code, e.g. "size", "mime" or "decode".
        self.detail = detail
def _as_provider(value: Any) -> TranscriptionProviderName | None:
    """Normalise *value* to a recognised provider name.

    Strings are stripped and lower-cased before being matched against the
    keys of ``_DEFAULT_MODELS``. Non-strings and unrecognised names give
    ``None``.
    """
    if not isinstance(value, str):
        return None
    candidate = value.strip().lower()
    if candidate not in _DEFAULT_MODELS:
        return None
    return candidate  # type: ignore[return-value]
def _provider_config(config: Any, provider: str) -> Any:
return getattr(getattr(config, "providers", None), provider, None)
def _extract_data_url_mime(url: str) -> str | None:
header, _, _ = url.partition(",")
if not header.startswith("data:") or ";base64" not in header:
return None
return header[5:].split(";", 1)[0].strip().lower() or None
def resolve_transcription_config(config: Any) -> EffectiveTranscriptionConfig:
    """Resolve top-level transcription settings with legacy channel fallback.

    Provider and language are taken from ``config.transcription`` first, then
    from the legacy ``config.channels.transcription_*`` fields, then from the
    built-in defaults. Credentials are read from ``config.providers.<provider>``.
    Missing sections and attributes are tolerated and fall back to defaults.

    Args:
        config: Loaded application config, accessed only via ``getattr``.

    Returns:
        The effective, immutable transcription configuration.
    """
    top = getattr(config, "transcription", None)
    channels = getattr(config, "channels", None)
    provider = (
        _as_provider(getattr(top, "provider", None))
        or _as_provider(getattr(channels, "transcription_provider", None))
        or _DEFAULT_PROVIDER
    )
    provider_cfg = _provider_config(config, provider)

    # Strip before falling back so a whitespace-only override resolves to the
    # provider default rather than an empty model name.
    model_override = getattr(top, "model", None)
    model = model_override.strip() if isinstance(model_override, str) else ""

    return EffectiveTranscriptionConfig(
        enabled=bool(getattr(top, "enabled", True)),
        provider=provider,
        model=model or _DEFAULT_MODELS[provider],
        language=getattr(top, "language", None) or getattr(channels, "transcription_language", None),
        api_key=getattr(provider_cfg, "api_key", None) or "",
        api_base=getattr(provider_cfg, "api_base", None) or "",
        max_duration_sec=int(getattr(top, "max_duration_sec", 120)),
        max_upload_mb=int(getattr(top, "max_upload_mb", 25)),
    )
async def transcribe_audio_data_url(
    data_url: Any,
    config: EffectiveTranscriptionConfig,
    *,
    duration_ms: Any = None,
) -> str:
    """Validate, persist, transcribe, and remove a WebUI audio data URL.

    Args:
        data_url: Base64 ``data:`` URL holding the recorded audio.
        config: Already-resolved transcription configuration.
        duration_ms: Optional client-reported recording length. It is only
            checked when it is an ``int`` or ``float``.

    Returns:
        The non-empty transcription text.

    Raises:
        TranscriptionIngressError: with ``detail`` set to one of
            ``"missing_audio"``, ``"disabled"``, ``"not_configured"``,
            ``"duration"``, ``"mime"``, ``"size"``, ``"decode"`` or ``"empty"``.
    """
    if not (isinstance(data_url, str) and data_url):
        raise TranscriptionIngressError("missing_audio")
    if not config.enabled:
        raise TranscriptionIngressError("disabled")
    if not config.configured:
        raise TranscriptionIngressError("not_configured", provider=config.provider)

    if isinstance(duration_ms, (int, float)):
        # One extra second of slack on top of the configured maximum.
        duration_limit_ms = config.max_duration_sec * 1000 + 1000
        if duration_ms > duration_limit_ms:
            raise TranscriptionIngressError("duration")

    mime = _extract_data_url_mime(data_url)
    if mime not in _AUDIO_MIME_ALLOWED:
        raise TranscriptionIngressError("mime")

    if config.max_upload_mb:
        byte_limit = config.max_upload_mb * 1024 * 1024
    else:
        byte_limit = _MAX_AUDIO_BYTES_FALLBACK
    byte_limit = max(1, byte_limit)

    saved_path: str | None = None
    try:
        saved_path = save_base64_data_url(
            data_url,
            get_media_dir("webui-transcription"),
            max_bytes=byte_limit,
        )
    except FileSizeExceeded as exc:
        raise TranscriptionIngressError("size") from exc
    except Exception as exc:
        # Any other failure is logged here and surfaces as "decode" below.
        logger.warning("transcription audio decode failed: {}", exc)
    if not saved_path:
        raise TranscriptionIngressError("decode")

    try:
        transcript = await transcribe_audio_file(saved_path, config)
    finally:
        # Remove the temporary upload whether or not transcription succeeded.
        with suppress(OSError):
            Path(saved_path).unlink(missing_ok=True)

    if not transcript:
        raise TranscriptionIngressError("empty")
    return transcript
async def transcribe_audio_file(
    file_path: str | Path,
    config: EffectiveTranscriptionConfig,
) -> str:
    """Transcribe *file_path* using the already-resolved transcription config.

    Returns an empty string without importing or contacting any provider when
    transcription is disabled or not configured. Otherwise the OpenAI adapter
    is used when ``config.provider == "openai"`` and the Groq adapter for any
    other value.
    """
    if not (config.enabled and config.configured):
        return ""

    # Adapters are imported only once a provider has actually been chosen.
    if config.provider == "openai":
        from nanobot.providers.transcription import (
            OpenAITranscriptionProvider as adapter_cls,
        )
    else:
        from nanobot.providers.transcription import (
            GroqTranscriptionProvider as adapter_cls,
        )

    adapter = adapter_cls(
        api_key=config.api_key,
        api_base=config.api_base or None,
        language=config.language,
        model=config.model,
    )
    return await adapter.transcribe(file_path)

View File

@ -28,10 +28,6 @@ class BaseChannel(ABC):
name: str = "base" name: str = "base"
display_name: str = "Base" display_name: str = "Base"
transcription_provider: str = "groq"
transcription_api_key: str = ""
transcription_api_base: str = ""
transcription_language: str | None = None
send_progress: bool = True send_progress: bool = True
send_tool_hints: bool = False send_tool_hints: bool = False
show_reasoning: bool = True show_reasoning: bool = True
@ -51,24 +47,14 @@ class BaseChannel(ABC):
async def transcribe_audio(self, file_path: str | Path) -> str: async def transcribe_audio(self, file_path: str | Path) -> str:
"""Transcribe an audio file via Whisper (OpenAI or Groq). Returns empty string on failure.""" """Transcribe an audio file via Whisper (OpenAI or Groq). Returns empty string on failure."""
if not self.transcription_api_key:
return ""
try: try:
if self.transcription_provider == "openai": from nanobot.audio.transcription import (
from nanobot.providers.transcription import OpenAITranscriptionProvider resolve_transcription_config,
provider = OpenAITranscriptionProvider( transcribe_audio_file,
api_key=self.transcription_api_key, )
api_base=self.transcription_api_base or None, from nanobot.config.loader import load_config
language=self.transcription_language or None,
) return await transcribe_audio_file(file_path, resolve_transcription_config(load_config()))
else:
from nanobot.providers.transcription import GroqTranscriptionProvider
provider = GroqTranscriptionProvider(
api_key=self.transcription_api_key,
api_base=self.transcription_api_base or None,
language=self.transcription_language or None,
)
return await provider.transcribe(file_path)
except Exception: except Exception:
self.logger.exception("Audio transcription failed") self.logger.exception("Audio transcription failed")
return "" return ""

View File

@ -80,11 +80,6 @@ class ChannelManager:
"""Initialize channels discovered via pkgutil scan + entry_points plugins.""" """Initialize channels discovered via pkgutil scan + entry_points plugins."""
from nanobot.channels.registry import discover_channel_names, discover_enabled from nanobot.channels.registry import discover_channel_names, discover_enabled
transcription_provider = self.config.channels.transcription_provider
transcription_key = self._resolve_transcription_key(transcription_provider)
transcription_base = self._resolve_transcription_base(transcription_provider)
transcription_language = self.config.channels.transcription_language
# Collect enabled module names first, then only import those. # Collect enabled module names first, then only import those.
# Channel configs live in ChannelsConfig's extra fields (via # Channel configs live in ChannelsConfig's extra fields (via
# extra="allow"), so we enumerate candidates from pkgutil scan # extra="allow"), so we enumerate candidates from pkgutil scan
@ -135,10 +130,6 @@ class ChannelManager:
) )
kwargs["gateway"] = gateway kwargs["gateway"] = gateway
channel = cls(section, self.bus, **kwargs) channel = cls(section, self.bus, **kwargs)
channel.transcription_provider = transcription_provider
channel.transcription_api_key = transcription_key
channel.transcription_api_base = transcription_base
channel.transcription_language = transcription_language
channel.send_progress = self._resolve_bool_override( channel.send_progress = self._resolve_bool_override(
section, "send_progress", self.config.channels.send_progress, section, "send_progress", self.config.channels.send_progress,
) )
@ -155,24 +146,6 @@ class ChannelManager:
self._validate_allow_from() self._validate_allow_from()
def _resolve_transcription_key(self, provider: str) -> str:
"""Pick the API key for the configured transcription provider."""
try:
if provider == "openai":
return self.config.providers.openai.api_key
return self.config.providers.groq.api_key
except AttributeError:
return ""
def _resolve_transcription_base(self, provider: str) -> str:
"""Pick the API base URL for the configured transcription provider."""
try:
if provider == "openai":
return self.config.providers.openai.api_base or ""
return self.config.providers.groq.api_base or ""
except AttributeError:
return ""
def _validate_allow_from(self) -> None: def _validate_allow_from(self) -> None:
for name, ch in self.channels.items(): for name, ch in self.channels.items():
cfg = ch.config cfg = ch.config

View File

@ -45,6 +45,7 @@ from nanobot.webui.http_utils import (
query_first as _query_first, query_first as _query_first,
) )
from nanobot.webui.mcp_presets_api import normalize_mcp_preset_mentions from nanobot.webui.mcp_presets_api import normalize_mcp_preset_mentions
from nanobot.webui.transcription_ws import webui_transcription_event
from nanobot.webui.websocket_logging import websockets_server_logger from nanobot.webui.websocket_logging import websockets_server_logger
@ -235,7 +236,7 @@ _VIDEO_MIME_ALLOWED: frozenset[str] = frozenset({
_UPLOAD_MIME_ALLOWED: frozenset[str] = _IMAGE_MIME_ALLOWED | _VIDEO_MIME_ALLOWED _UPLOAD_MIME_ALLOWED: frozenset[str] = _IMAGE_MIME_ALLOWED | _VIDEO_MIME_ALLOWED
_DATA_URL_MIME_RE = re.compile(r"^data:([^;]+);base64,", re.DOTALL) _DATA_URL_MIME_RE = re.compile(r"^data:([^;,]+)(?:;[^,]*)*;base64,", re.DOTALL)
def _extract_data_url_mime(url: str) -> str | None: def _extract_data_url_mime(url: str) -> str | None:
@ -419,7 +420,6 @@ class WebSocketChannel(BaseChannel):
return None return None
# -- Server lifecycle and connection ingress --------------------------- # -- Server lifecycle and connection ingress ---------------------------
# -- Server lifecycle and connection ingress ---------------------------
async def start(self) -> None: async def start(self) -> None:
from nanobot.utils.logging_bridge import redirect_lib_logging from nanobot.utils.logging_bridge import redirect_lib_logging
@ -703,6 +703,10 @@ class WebSocketChannel(BaseChannel):
workspace_scope=scope.payload(), workspace_scope=scope.payload(),
) )
return return
if t == "transcribe_audio":
event, payload = await webui_transcription_event(envelope)
await self._send_event(connection, event, **payload)
return
if t == "message": if t == "message":
cid = envelope.get("chat_id") cid = envelope.get("chat_id")
content = envelope.get("content") content = envelope.get("content")

View File

@ -39,8 +39,19 @@ class ChannelsConfig(Base):
show_reasoning: bool = True # surface model reasoning when channel implements it show_reasoning: bool = True # surface model reasoning when channel implements it
extract_document_text: bool = True # extract text from document attachments before sending to the model extract_document_text: bool = True # extract text from document attachments before sending to the model
send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included) send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included)
transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai" transcription_provider: str = "groq" # Deprecated: use top-level transcription.provider
transcription_language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$") # Optional ISO-639-1 hint for audio transcription transcription_language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$") # Deprecated: use top-level transcription.language
class TranscriptionConfig(Base):
    """Cross-channel audio transcription configuration."""

    # Switch for audio transcription.
    enabled: bool = True
    # Transcription backend; None leaves the provider unset in this section.
    provider: Literal["groq", "openai"] | None = None
    # Optional model override; None leaves the model unset in this section.
    model: str | None = None
    # Optional language hint: two or three lowercase ASCII letters.
    language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")
    # Recording length limit in seconds (1-600).
    max_duration_sec: int = Field(default=120, ge=1, le=600)
    # Upload size limit in megabytes (1-100).
    max_upload_mb: int = Field(default=25, ge=1, le=100)
class DreamConfig(Base): class DreamConfig(Base):
@ -167,7 +178,7 @@ class AgentsConfig(Base):
class ProviderConfig(Base): class ProviderConfig(Base):
"""LLM provider configuration.""" """LLM provider configuration."""
api_key: str | None = None api_key: str | None = Field(default=None, repr=False)
api_base: str | None = None api_base: str | None = None
api_type: Literal["auto", "chat_completions", "responses"] = "auto" # Request API surface api_type: Literal["auto", "chat_completions", "responses"] = "auto" # Request API surface
extra_headers: dict[str, str] | None = None # Custom headers (e.g. APP-Code for AiHubMix) extra_headers: dict[str, str] | None = None # Custom headers (e.g. APP-Code for AiHubMix)
@ -312,6 +323,7 @@ class Config(BaseSettings):
agents: AgentsConfig = Field(default_factory=AgentsConfig) agents: AgentsConfig = Field(default_factory=AgentsConfig)
channels: ChannelsConfig = Field(default_factory=ChannelsConfig) channels: ChannelsConfig = Field(default_factory=ChannelsConfig)
transcription: TranscriptionConfig = Field(default_factory=TranscriptionConfig)
providers: ProvidersConfig = Field(default_factory=ProvidersConfig) providers: ProvidersConfig = Field(default_factory=ProvidersConfig)
api: ApiConfig = Field(default_factory=ApiConfig) api: ApiConfig = Field(default_factory=ApiConfig)
gateway: GatewayConfig = Field(default_factory=GatewayConfig) gateway: GatewayConfig = Field(default_factory=GatewayConfig)

View File

@ -1,6 +1,12 @@
"""Voice transcription providers (Groq and OpenAI Whisper).""" """Provider-specific voice transcription adapters.
This module only knows how to call external transcription APIs such as Groq
and OpenAI Whisper. Product-level config fallback, WebUI upload validation,
and channel integration live in ``nanobot.audio.transcription``.
"""
import asyncio import asyncio
import mimetypes
import os import os
from pathlib import Path from pathlib import Path
@ -8,6 +14,15 @@ import httpx
from loguru import logger from loguru import logger
_TRANSCRIPTIONS_PATH = "audio/transcriptions" _TRANSCRIPTIONS_PATH = "audio/transcriptions"
_AUDIO_MIME_OVERRIDES = {
".m4a": "audio/mp4",
".mpga": "audio/mpeg",
".ogg": "audio/ogg",
".opus": "audio/ogg",
".wav": "audio/wav",
".weba": "audio/webm",
".webm": "audio/webm",
}
def _resolve_transcription_url(api_base: str | None, default_url: str) -> str: def _resolve_transcription_url(api_base: str | None, default_url: str) -> str:
@ -26,6 +41,14 @@ def _resolve_transcription_url(api_base: str | None, default_url: str) -> str:
return f"{base}/{_TRANSCRIPTIONS_PATH}" return f"{base}/{_TRANSCRIPTIONS_PATH}"
def _audio_mime_type(path: Path) -> str:
return (
_AUDIO_MIME_OVERRIDES.get(path.suffix.lower())
or mimetypes.guess_type(path.name)[0]
or "application/octet-stream"
)
# Up to 3 retries (4 attempts total) with exponential backoff on transient # Up to 3 retries (4 attempts total) with exponential backoff on transient
# failures. Whisper endpoints occasionally return 502/503 under load, and # failures. Whisper endpoints occasionally return 502/503 under load, and
# mobile-network transcription callers hit sporadic connect/read errors. # mobile-network transcription callers hit sporadic connect/read errors.
@ -71,7 +94,7 @@ async def _post_transcription_with_retry(
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
for attempt in range(_MAX_RETRIES + 1): for attempt in range(_MAX_RETRIES + 1):
files = { files = {
"file": (path.name, data), "file": (path.name, data, _audio_mime_type(path)),
"model": (None, model), "model": (None, model),
} }
if language: if language:
@ -113,6 +136,16 @@ async def _post_transcription_with_retry(
try: try:
response.raise_for_status() response.raise_for_status()
except httpx.HTTPStatusError:
body = response.text.strip().replace("\n", " ")[:500]
logger.error(
"{} transcription HTTP {}{}{}",
provider_label,
response.status_code,
f" {response.reason_phrase}" if response.reason_phrase else "",
f": {body}" if body else "",
)
return ""
except Exception as e: except Exception as e:
logger.exception("{} transcription error: {}", provider_label, e) logger.exception("{} transcription error: {}", provider_label, e)
return "" return ""
@ -144,6 +177,7 @@ class OpenAITranscriptionProvider:
api_key: str | None = None, api_key: str | None = None,
api_base: str | None = None, api_base: str | None = None,
language: str | None = None, language: str | None = None,
model: str | None = None,
): ):
self.api_key = api_key or os.environ.get("OPENAI_API_KEY") self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
self.api_url = _resolve_transcription_url( self.api_url = _resolve_transcription_url(
@ -151,6 +185,7 @@ class OpenAITranscriptionProvider:
"https://api.openai.com/v1/audio/transcriptions", "https://api.openai.com/v1/audio/transcriptions",
) )
self.language = language or None self.language = language or None
self.model = model or "whisper-1"
logger.debug("OpenAI transcription endpoint: {}", self.api_url) logger.debug("OpenAI transcription endpoint: {}", self.api_url)
async def transcribe(self, file_path: str | Path) -> str: async def transcribe(self, file_path: str | Path) -> str:
@ -165,7 +200,7 @@ class OpenAITranscriptionProvider:
self.api_url, self.api_url,
api_key=self.api_key, api_key=self.api_key,
path=path, path=path,
model="whisper-1", model=self.model,
provider_label="OpenAI", provider_label="OpenAI",
language=self.language, language=self.language,
) )
@ -183,6 +218,7 @@ class GroqTranscriptionProvider:
api_key: str | None = None, api_key: str | None = None,
api_base: str | None = None, api_base: str | None = None,
language: str | None = None, language: str | None = None,
model: str | None = None,
): ):
self.api_key = api_key or os.environ.get("GROQ_API_KEY") self.api_key = api_key or os.environ.get("GROQ_API_KEY")
self.api_url = _resolve_transcription_url( self.api_url = _resolve_transcription_url(
@ -190,6 +226,7 @@ class GroqTranscriptionProvider:
"https://api.groq.com/openai/v1/audio/transcriptions", "https://api.groq.com/openai/v1/audio/transcriptions",
) )
self.language = language or None self.language = language or None
self.model = model or "whisper-large-v3"
logger.debug("Groq transcription endpoint: {}", self.api_url) logger.debug("Groq transcription endpoint: {}", self.api_url)
async def transcribe(self, file_path: str | Path) -> str: async def transcribe(self, file_path: str | Path) -> str:
@ -215,7 +252,7 @@ class GroqTranscriptionProvider:
self.api_url, self.api_url,
api_key=self.api_key, api_key=self.api_key,
path=path, path=path,
model="whisper-large-v3", model=self.model,
provider_label="Groq", provider_label="Groq",
language=self.language, language=self.language,
) )

View File

@ -18,13 +18,30 @@ from nanobot.utils.helpers import safe_filename
DEFAULT_MAX_BYTES = 10 * 1024 * 1024 DEFAULT_MAX_BYTES = 10 * 1024 * 1024
MAX_FILE_SIZE = DEFAULT_MAX_BYTES MAX_FILE_SIZE = DEFAULT_MAX_BYTES
_DATA_URL_RE = re.compile(r"^data:([^;]+);base64,(.+)$", re.DOTALL) _DATA_URL_RE = re.compile(r"^data:([^;,]+)(?:;[^,]*)*;base64,(.+)$", re.DOTALL)
_MIME_EXTENSION_OVERRIDES = {
# Python's ``mimetypes`` maps browser-recorded audio/webm to ``.weba`` and
# audio/ogg to ``.oga`` on macOS. Some transcription APIs validate by the
# file extension and accept the canonical container extensions instead.
"application/ogg": ".ogg",
"audio/ogg": ".ogg",
"audio/mpga": ".mpga",
"audio/wav": ".wav",
"audio/webm": ".webm",
"audio/x-m4a": ".m4a",
"audio/x-wav": ".wav",
"audio/vnd.wave": ".wav",
"video/webm": ".webm",
}
class FileSizeExceeded(Exception): class FileSizeExceededError(Exception):
"""Raised when a decoded payload exceeds the caller's size limit.""" """Raised when a decoded payload exceeds the caller's size limit."""
FileSizeExceeded = FileSizeExceededError
def save_base64_data_url( def save_base64_data_url(
data_url: str, data_url: str,
media_dir: Path, media_dir: Path,
@ -40,7 +57,7 @@ def save_base64_data_url(
m = _DATA_URL_RE.match(data_url) m = _DATA_URL_RE.match(data_url)
if not m: if not m:
return None return None
mime_type, b64_payload = m.group(1), m.group(2) mime_type, b64_payload = m.group(1).strip().lower(), m.group(2)
try: try:
raw = base64.b64decode(b64_payload) raw = base64.b64decode(b64_payload)
except Exception: except Exception:
@ -48,7 +65,7 @@ def save_base64_data_url(
limit = DEFAULT_MAX_BYTES if max_bytes is None else max_bytes limit = DEFAULT_MAX_BYTES if max_bytes is None else max_bytes
if len(raw) > limit: if len(raw) > limit:
raise FileSizeExceeded(f"File exceeds {limit // (1024 * 1024)}MB limit") raise FileSizeExceeded(f"File exceeds {limit // (1024 * 1024)}MB limit")
ext = mimetypes.guess_extension(mime_type) or ".bin" ext = _MIME_EXTENSION_OVERRIDES.get(mime_type) or mimetypes.guess_extension(mime_type) or ".bin"
filename = f"{uuid.uuid4().hex[:12]}{ext}" filename = f"{uuid.uuid4().hex[:12]}{ext}"
dest = media_dir / safe_filename(filename) dest = media_dir / safe_filename(filename)
dest.write_bytes(raw) dest.write_bytes(raw)

View File

@ -15,6 +15,7 @@ from zoneinfo import ZoneInfo
import httpx import httpx
from nanobot.audio.transcription import resolve_transcription_config
from nanobot.config.loader import get_config_path, load_config, save_config from nanobot.config.loader import get_config_path, load_config, save_config
from nanobot.config.schema import ModelPresetConfig from nanobot.config.schema import ModelPresetConfig
from nanobot.providers.image_generation import ( from nanobot.providers.image_generation import (
@ -90,6 +91,7 @@ _IMAGE_GENERATION_ASPECT_RATIOS = {
"2:3", "2:3",
"21:9", "21:9",
} }
_TRANSCRIPTION_PROVIDERS = ("groq", "openai")
_CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144} _CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144}
_MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+") _MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+")
_ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}") _ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
@ -576,6 +578,22 @@ def _image_generation_provider_rows(config: Any) -> list[dict[str, Any]]:
return rows return rows
def _transcription_provider_rows(config: Any) -> list[dict[str, Any]]:
rows: list[dict[str, Any]] = []
for name in _TRANSCRIPTION_PROVIDERS:
spec = find_by_name(name)
provider_config = getattr(config.providers, name, None)
rows.append({
"name": name,
"label": spec.label if spec is not None else name,
"configured": bool(getattr(provider_config, "api_key", None)),
"api_key_hint": _mask_secret_hint(getattr(provider_config, "api_key", None)),
"api_base": getattr(provider_config, "api_base", None),
"default_api_base": spec.default_api_base if spec and spec.default_api_base else None,
})
return rows
def settings_payload( def settings_payload(
*, *,
requires_restart: bool = False, requires_restart: bool = False,
@ -633,6 +651,7 @@ def settings_payload(
search_config = config.tools.web.search search_config = config.tools.web.search
image_config = config.tools.image_generation image_config = config.tools.image_generation
transcription = resolve_transcription_config(config)
search_provider = ( search_provider = (
search_config.provider search_config.provider
if search_config.provider in _WEB_SEARCH_PROVIDER_BY_NAME if search_config.provider in _WEB_SEARCH_PROVIDER_BY_NAME
@ -733,6 +752,16 @@ def settings_payload(
"save_dir": image_config.save_dir, "save_dir": image_config.save_dir,
"providers": image_providers, "providers": image_providers,
}, },
"transcription": {
"enabled": transcription.enabled,
"provider": transcription.provider,
"provider_configured": transcription.configured,
"model": transcription.model,
"language": transcription.language,
"max_duration_sec": transcription.max_duration_sec,
"max_upload_mb": transcription.max_upload_mb,
"providers": _transcription_provider_rows(config),
},
"runtime": { "runtime": {
"config_path": str(get_config_path().expanduser()), "config_path": str(get_config_path().expanduser()),
"workspace_path": str(config.workspace_path), "workspace_path": str(config.workspace_path),
@ -1311,3 +1340,71 @@ def update_image_generation_settings(query: QueryParams) -> dict[str, Any]:
if changed: if changed:
save_config(config) save_config(config)
return settings_payload(requires_restart=changed) return settings_payload(requires_restart=changed)
def update_transcription_settings(query: QueryParams) -> dict[str, Any]:
config = load_config()
transcription = config.transcription
changed = False
enabled = _query_first(query, "enabled")
if enabled is not None:
parsed_enabled = _parse_bool(enabled, "enabled")
if transcription.enabled != parsed_enabled:
transcription.enabled = parsed_enabled
changed = True
provider = _query_first(query, "provider")
if provider is not None:
provider = provider.strip().lower()
if provider not in _TRANSCRIPTION_PROVIDERS:
raise WebUISettingsError("unknown transcription provider")
if transcription.provider != provider:
transcription.provider = provider # type: ignore[assignment]
changed = True
model = _query_first(query, "model")
if model is not None:
model = model.strip() or None
if model is not None and len(model) > 200:
raise WebUISettingsError("transcription model is too long")
if transcription.model != model:
transcription.model = model
changed = True
language = _query_first(query, "language")
if language is not None:
language = language.strip().lower() or None
if language is not None and not re.fullmatch(r"[a-z]{2,3}", language):
raise WebUISettingsError("transcription language must be 2-3 lowercase letters")
if transcription.language != language:
transcription.language = language
changed = True
max_duration_sec = _query_first_alias(query, "max_duration_sec", "maxDurationSec")
if max_duration_sec is not None:
try:
parsed_duration = int(max_duration_sec)
except ValueError:
raise WebUISettingsError("max_duration_sec must be an integer") from None
if parsed_duration < 1 or parsed_duration > 600:
raise WebUISettingsError("max_duration_sec must be between 1 and 600")
if transcription.max_duration_sec != parsed_duration:
transcription.max_duration_sec = parsed_duration
changed = True
max_upload_mb = _query_first_alias(query, "max_upload_mb", "maxUploadMb")
if max_upload_mb is not None:
try:
parsed_upload = int(max_upload_mb)
except ValueError:
raise WebUISettingsError("max_upload_mb must be an integer") from None
if parsed_upload < 1 or parsed_upload > 100:
raise WebUISettingsError("max_upload_mb must be between 1 and 100")
if transcription.max_upload_mb != parsed_upload:
transcription.max_upload_mb = parsed_upload
changed = True
if changed:
save_config(config)
return settings_payload()

View File

@ -33,6 +33,7 @@ from nanobot.webui.settings_api import (
update_model_configuration, update_model_configuration,
update_network_safety_settings, update_network_safety_settings,
update_provider_settings, update_provider_settings,
update_transcription_settings,
update_web_search_settings, update_web_search_settings,
) )
@ -100,6 +101,8 @@ class WebUISettingsRouter:
return self._handle_settings_web_search_update(request) return self._handle_settings_web_search_update(request)
if path == "/api/settings/image-generation/update": if path == "/api/settings/image-generation/update":
return self._handle_settings_image_generation_update(request) return self._handle_settings_image_generation_update(request)
if path == "/api/settings/transcription/update":
return self._handle_settings_transcription_update(request)
if path == "/api/settings/network-safety/update": if path == "/api/settings/network-safety/update":
return self._handle_settings_network_safety_update(request) return self._handle_settings_network_safety_update(request)
if path == "/api/settings/cli-apps": if path == "/api/settings/cli-apps":
@ -275,6 +278,15 @@ class WebUISettingsRouter:
return self._error_response(e.status, e.message) return self._error_response(e.status, e.message)
return self._json_response(self._with_restart_state(payload, section="image")) return self._json_response(self._with_restart_state(payload, section="image"))
def _handle_settings_transcription_update(self, request: WsRequest) -> Response:
if not self._authorized(request):
return self._unauthorized()
try:
payload = update_transcription_settings(self._query(request))
except WebUISettingsError as e:
return self._error_response(e.status, e.message)
return self._json_response(self._with_restart_state(payload))
def _handle_settings_network_safety_update(self, request: WsRequest) -> Response: def _handle_settings_network_safety_update(self, request: WsRequest) -> Response:
if not self._authorized(request): if not self._authorized(request):
return self._unauthorized() return self._unauthorized()

View File

@ -0,0 +1,46 @@
"""WebUI transcription envelope handling.
The WebSocket channel owns transport and subscription fan-out. This module owns
the WebUI-specific audio transcription action carried over that socket.
"""
from __future__ import annotations
from typing import Any
from nanobot.audio.transcription import (
TranscriptionIngressError,
resolve_transcription_config,
transcribe_audio_data_url,
)
from nanobot.config.loader import load_config
_MAX_REQUEST_ID_LENGTH = 80
async def webui_transcription_event(envelope: dict[str, Any]) -> tuple[str, dict[str, Any]]:
"""Return the WS event name and payload for one WebUI transcription request."""
request_id = envelope.get("request_id")
valid_request_id = (
isinstance(request_id, str)
and 0 < len(request_id) <= _MAX_REQUEST_ID_LENGTH
)
def error(detail: str, **extra: Any) -> tuple[str, dict[str, Any]]:
payload: dict[str, Any] = {"detail": detail, **extra}
if valid_request_id:
payload["request_id"] = request_id
return "transcription_error", payload
if not valid_request_id:
return error("invalid_request")
try:
text = await transcribe_audio_data_url(
envelope.get("data_url"),
resolve_transcription_config(load_config()),
duration_ms=envelope.get("duration_ms"),
)
except TranscriptionIngressError as exc:
return error(exc.detail, **exc.extra)
return "transcription_result", {"request_id": request_id, "text": text}

View File

@ -12,7 +12,8 @@ from nanobot.bus.events import OutboundMessage
from nanobot.bus.queue import MessageBus from nanobot.bus.queue import MessageBus
from nanobot.channels.base import BaseChannel from nanobot.channels.base import BaseChannel
from nanobot.channels.manager import ChannelManager from nanobot.channels.manager import ChannelManager
from nanobot.config.schema import ChannelsConfig from nanobot.config.loader import save_config
from nanobot.config.schema import ChannelsConfig, Config
from nanobot.providers.transcription import GroqTranscriptionProvider as _GroqProvider from nanobot.providers.transcription import GroqTranscriptionProvider as _GroqProvider
from nanobot.providers.transcription import OpenAITranscriptionProvider as _OpenAIProvider from nanobot.providers.transcription import OpenAITranscriptionProvider as _OpenAIProvider
from nanobot.utils.restart import RestartNotice from nanobot.utils.restart import RestartNotice
@ -238,102 +239,103 @@ async def test_manager_loads_plugin_from_dict_config():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_manager_propagates_groq_transcription_api_base_to_channels(): async def test_base_channel_reads_current_transcription_config_each_call(
from nanobot.channels.manager import ChannelManager tmp_path,
monkeypatch: pytest.MonkeyPatch,
fake_config = SimpleNamespace( ):
channels=ChannelsConfig.model_validate({ """BaseChannel.transcribe_audio resolves config at call time, not manager init time."""
"fakeplugin": {"enabled": True, "allowFrom": ["*"]},
"transcriptionLanguage": "en",
}),
providers=SimpleNamespace(
groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"),
openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"),
),
)
with patch(
"nanobot.channels.registry.discover_enabled",
return_value={"fakeplugin": _FakePlugin},
):
mgr = ChannelManager.__new__(ChannelManager)
mgr.config = fake_config
mgr.bus = MessageBus()
mgr.channels = {}
mgr._dispatch_task = None
mgr._init_channels()
channel = mgr.channels["fakeplugin"]
assert channel.transcription_provider == "groq"
assert channel.transcription_api_key == "groq-key"
assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
assert channel.transcription_language == "en"
@pytest.mark.asyncio
async def test_manager_propagates_openai_transcription_api_base_to_channels():
from nanobot.channels.manager import ChannelManager
fake_config = SimpleNamespace(
channels=ChannelsConfig.model_validate({
"fakeplugin": {"enabled": True, "allowFrom": ["*"]},
"transcriptionProvider": "openai",
}),
providers=SimpleNamespace(
openai=SimpleNamespace(
api_key="openai-key",
api_base="http://proxy.local/v1/audio/transcriptions",
),
groq=SimpleNamespace(api_key="groq-key", api_base=""),
),
)
with patch(
"nanobot.channels.registry.discover_enabled",
return_value={"fakeplugin": _FakePlugin},
):
mgr = ChannelManager.__new__(ChannelManager)
mgr.config = fake_config
mgr.bus = MessageBus()
mgr.channels = {}
mgr._dispatch_task = None
mgr._init_channels()
channel = mgr.channels["fakeplugin"]
assert channel.transcription_provider == "openai"
assert channel.transcription_api_key == "openai-key"
assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
@pytest.mark.asyncio
async def test_base_channel_passes_api_base_to_openai_transcription_provider():
"""BaseChannel.transcribe_audio must forward transcription_api_base to OpenAI."""
from nanobot.providers import transcription as transcription_mod from nanobot.providers import transcription as transcription_mod
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus()) config_path = tmp_path / "config.json"
channel.transcription_provider = "openai" config = Config()
channel.transcription_api_key = "k" config.transcription.provider = "openai"
channel.transcription_api_base = "http://override/v1/audio/transcriptions" config.transcription.model = "whisper-custom"
channel.transcription_language = "en" config.transcription.language = "en"
config.providers.openai.api_key = "openai-key"
config.providers.openai.api_base = "http://openai.local/v1/audio/transcriptions"
save_config(config, config_path)
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
captured: dict[str, object] = {} channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
calls: list[dict[str, object]] = []
class _StubOpenAI: class _StubOpenAI:
def __init__(self, api_key=None, api_base=None, language=None): def __init__(self, api_key=None, api_base=None, language=None, model=None):
captured["api_key"] = api_key calls.append({
captured["api_base"] = api_base "provider": "openai",
captured["language"] = language "api_key": api_key,
"api_base": api_base,
"language": language,
"model": model,
})
async def transcribe(self, file_path): async def transcribe(self, file_path):
return "ok" return "openai-ok"
with patch.object(transcription_mod, "OpenAITranscriptionProvider", _StubOpenAI): class _StubGroq:
result = await channel.transcribe_audio("/tmp/does-not-matter.wav") def __init__(self, api_key=None, api_base=None, language=None, model=None):
calls.append({
"provider": "groq",
"api_key": api_key,
"api_base": api_base,
"language": language,
"model": model,
})
assert result == "ok" async def transcribe(self, file_path):
assert captured["api_key"] == "k" return "groq-ok"
assert captured["api_base"] == "http://override/v1/audio/transcriptions"
assert captured["language"] == "en" with (
patch.object(transcription_mod, "OpenAITranscriptionProvider", _StubOpenAI),
patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq),
):
assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "openai-ok"
config.transcription.provider = "groq"
config.transcription.model = "whisper-large-v3-turbo"
config.transcription.language = "ko"
config.providers.groq.api_key = "groq-key"
config.providers.groq.api_base = "http://groq.local/v1/audio/transcriptions"
save_config(config, config_path)
assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "groq-ok"
assert calls == [
{
"provider": "openai",
"api_key": "openai-key",
"api_base": "http://openai.local/v1/audio/transcriptions",
"language": "en",
"model": "whisper-custom",
},
{
"provider": "groq",
"api_key": "groq-key",
"api_base": "http://groq.local/v1/audio/transcriptions",
"language": "ko",
"model": "whisper-large-v3-turbo",
},
]
@pytest.mark.asyncio
async def test_base_channel_respects_disabled_transcription_config(
tmp_path,
monkeypatch: pytest.MonkeyPatch,
):
config_path = tmp_path / "config.json"
config = Config()
config.transcription.enabled = False
config.providers.groq.api_key = "groq-key"
save_config(config, config_path)
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
with patch("nanobot.providers.transcription.GroqTranscriptionProvider") as provider:
assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == ""
provider.assert_not_called()
def test_openai_transcription_provider_honors_api_base_argument(): def test_openai_transcription_provider_honors_api_base_argument():
@ -348,37 +350,6 @@ def test_openai_transcription_provider_honors_api_base_argument():
assert custom.api_url == "http://override/v1/audio/transcriptions" assert custom.api_url == "http://override/v1/audio/transcriptions"
@pytest.mark.asyncio
async def test_base_channel_passes_language_to_groq_transcription_provider():
"""BaseChannel.transcribe_audio must forward transcription_language to Groq."""
from nanobot.providers import transcription as transcription_mod
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
channel.transcription_provider = "groq"
channel.transcription_api_key = "k"
channel.transcription_api_base = "http://override/v1/audio/transcriptions"
channel.transcription_language = "ko"
captured: dict[str, object] = {}
class _StubGroq:
def __init__(self, api_key=None, api_base=None, language=None):
captured["api_key"] = api_key
captured["api_base"] = api_base
captured["language"] = language
async def transcribe(self, file_path):
return "ok"
with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq):
result = await channel.transcribe_audio("/tmp/does-not-matter.wav")
assert result == "ok"
assert captured["api_key"] == "k"
assert captured["api_base"] == "http://override/v1/audio/transcriptions"
assert captured["language"] == "ko"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Transcription provider HTTP tests # Transcription provider HTTP tests
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

View File

@ -69,6 +69,7 @@ def _make_channel() -> WebSocketChannel:
[ [
("data:image/png;base64,AAAA", "image/png"), ("data:image/png;base64,AAAA", "image/png"),
("data:image/jpeg;base64,AAAA", "image/jpeg"), ("data:image/jpeg;base64,AAAA", "image/jpeg"),
("data:audio/webm;codecs=opus;base64,AAAA", "audio/webm"),
("data:IMAGE/PNG;base64,AAAA", "image/png"), ("data:IMAGE/PNG;base64,AAAA", "image/png"),
("data:image/svg+xml;base64,AAAA", "image/svg+xml"), ("data:image/svg+xml;base64,AAAA", "image/svg+xml"),
("data:text/plain;base64,AAAA", "text/plain"), ("data:text/plain;base64,AAAA", "text/plain"),

View File

@ -271,8 +271,6 @@ async def test_lid_to_phone_cache_resolves_lid_only_messages():
async def test_voice_message_transcription_uses_media_path(): async def test_voice_message_transcription_uses_media_path():
"""Voice messages are transcribed when media path is available.""" """Voice messages are transcribed when media path is available."""
ch = WhatsAppChannel({"enabled": True, "allowFrom": ["*"]}, MagicMock()) ch = WhatsAppChannel({"enabled": True, "allowFrom": ["*"]}, MagicMock())
ch.transcription_provider = "openai"
ch.transcription_api_key = "sk-test"
ch._handle_message = AsyncMock() ch._handle_message = AsyncMock()
ch.transcribe_audio = AsyncMock(return_value="Hello world") ch.transcribe_audio = AsyncMock(return_value="Hello world")

View File

@ -8,6 +8,8 @@ from unittest.mock import AsyncMock, patch
import httpx import httpx
import pytest import pytest
from nanobot.audio.transcription import resolve_transcription_config
from nanobot.config.schema import Config
from nanobot.providers.transcription import ( from nanobot.providers.transcription import (
GroqTranscriptionProvider, GroqTranscriptionProvider,
OpenAITranscriptionProvider, OpenAITranscriptionProvider,
@ -33,6 +35,65 @@ def _raw_response(status: int, content: bytes) -> httpx.Response:
return httpx.Response(status_code=status, content=content, request=request) return httpx.Response(status_code=status, content=content, request=request)
def test_resolver_uses_legacy_channel_provider_when_top_level_is_unset() -> None:
config = Config()
config.channels.transcription_provider = "openai"
config.channels.transcription_language = "en"
config.providers.openai.api_key = "sk-test"
config.providers.openai.api_base = "https://proxy.example/v1"
resolved = resolve_transcription_config(config)
assert resolved.provider == "openai"
assert resolved.model == "whisper-1"
assert resolved.language == "en"
assert resolved.api_key == "sk-test"
assert resolved.api_base == "https://proxy.example/v1"
assert resolved.configured is True
def test_resolver_prefers_top_level_transcription_over_legacy_channels() -> None:
config = Config()
config.channels.transcription_provider = "openai"
config.channels.transcription_language = "en"
config.transcription.provider = "groq"
config.transcription.model = "whisper-large-v3-turbo"
config.transcription.language = "ko"
config.providers.groq.api_key = "gsk-test"
config.providers.groq.api_base = "https://groq.example/openai/v1"
resolved = resolve_transcription_config(config)
assert resolved.provider == "groq"
assert resolved.model == "whisper-large-v3-turbo"
assert resolved.language == "ko"
assert resolved.api_key == "gsk-test"
assert resolved.api_base == "https://groq.example/openai/v1"
def test_resolved_transcription_repr_hides_api_key() -> None:
config = Config()
config.providers.groq.api_key = "gsk-secret"
resolved = resolve_transcription_config(config)
assert "gsk-secret" not in repr(resolved)
assert "api_key" not in repr(resolved)
def test_resolver_keeps_enabled_and_limits_on_effective_config() -> None:
config = Config()
config.transcription.enabled = False
config.transcription.max_duration_sec = 45
config.transcription.max_upload_mb = 12
resolved = resolve_transcription_config(config)
assert resolved.enabled is False
assert resolved.max_duration_sec == 45
assert resolved.max_upload_mb == 12
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# OpenAI provider — retry on transient HTTP + network errors # OpenAI provider — retry on transient HTTP + network errors
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -215,6 +276,32 @@ async def test_provider_omits_language_when_unset(
assert "language" not in files assert "language" not in files
@pytest.mark.asyncio
async def test_provider_forwards_custom_model_in_multipart(audio_file: Path) -> None:
provider = GroqTranscriptionProvider(api_key="k", model="whisper-large-v3-turbo")
post = AsyncMock(return_value=_response(200, {"text": "ok"}))
with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()):
result = await provider.transcribe(audio_file)
assert result == "ok"
files = post.await_args_list[0].kwargs["files"]
assert files["model"] == (None, "whisper-large-v3-turbo")
@pytest.mark.asyncio
async def test_provider_forwards_file_mime_type(tmp_path: Path) -> None:
audio = tmp_path / "voice.webm"
audio.write_bytes(b"audio")
provider = GroqTranscriptionProvider(api_key="k")
post = AsyncMock(return_value=_response(200, {"text": "ok"}))
with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()):
result = await provider.transcribe(audio)
assert result == "ok"
files = post.await_args_list[0].kwargs["files"]
assert files["file"] == ("voice.webm", b"audio", "audio/webm")
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_language_survives_retry(audio_file: Path) -> None: async def test_language_survives_retry(audio_file: Path) -> None:
"""Regression: language must be present on every retry attempt, not just the first.""" """Regression: language must be present on every retry attempt, not just the first."""

View File

@ -6,8 +6,12 @@ import shlex
import subprocess import subprocess
import sys import sys
from nanobot.agent.tools.exec_session import (
ExecSessionManager,
ListExecSessionsTool,
WriteStdinTool,
)
from nanobot.agent.tools.shell import ExecTool from nanobot.agent.tools.shell import ExecTool
from nanobot.agent.tools.exec_session import ExecSessionManager, ListExecSessionsTool, WriteStdinTool
def _python_command(code: str) -> str: def _python_command(code: str) -> str:
@ -141,7 +145,7 @@ def test_exec_can_continue_with_stdin(tmp_path):
return initial, result return initial, result
initial, result = asyncio.run(run()) initial, result = asyncio.run(run())
assert "ready" in initial assert "ready" in initial + result
assert "Process running" in initial assert "Process running" in initial
assert "Elapsed:" in initial assert "Elapsed:" in initial
assert "got:ping" in result assert "got:ping" in result
@ -170,7 +174,7 @@ def test_write_stdin_can_close_stdin(tmp_path):
return initial, result return initial, result
initial, result = asyncio.run(run()) initial, result = asyncio.run(run())
assert "ready" in initial assert "ready" in initial + result
assert "got:payload" in result assert "got:payload" in result
assert "Stdin closed." in result assert "Stdin closed." in result
assert "Exit code: 0" in result assert "Exit code: 0" in result
@ -185,14 +189,20 @@ def test_write_stdin_can_terminate_session(tmp_path):
"import time; print('ready', flush=True); time.sleep(30)" "import time; print('ready', flush=True); time.sleep(30)"
) )
initial = await exec_tool.execute(command=command, yield_time_ms=500) initial = await exec_tool.execute(command=command, yield_time_ms=100)
sid = _session_id(initial) sid = _session_id(initial)
waited = await stdin_tool.execute(
session_id=sid,
wait_for="ready",
wait_timeout_ms=3000,
yield_time_ms=0,
)
result = await stdin_tool.execute( result = await stdin_tool.execute(
session_id=sid, session_id=sid,
terminate=True, terminate=True,
yield_time_ms=0, yield_time_ms=0,
) )
return initial, result return initial + waited, result
initial, result = asyncio.run(run()) initial, result = asyncio.run(run())
assert "ready" in initial assert "ready" in initial
@ -243,7 +253,7 @@ def test_write_stdin_preserves_completed_session_output_until_polled(tmp_path):
initial, final = asyncio.run(run()) initial, final = asyncio.run(run())
assert "ready" in initial assert "ready" in initial + final
assert "done" in final assert "done" in final
assert "Exit code: 0" in final assert "Exit code: 0" in final

View File

@ -8,8 +8,8 @@ import pytest
from nanobot.utils.media_decode import ( from nanobot.utils.media_decode import (
DEFAULT_MAX_BYTES, DEFAULT_MAX_BYTES,
FileSizeExceeded,
MAX_FILE_SIZE, MAX_FILE_SIZE,
FileSizeExceeded,
save_base64_data_url, save_base64_data_url,
) )
@ -25,6 +25,31 @@ def test_saves_png_with_correct_extension(tmp_path) -> None:
assert (tmp_path / result.split("/")[-1]).read_bytes() == b"fake png" assert (tmp_path / result.split("/")[-1]).read_bytes() == b"fake png"
def test_saves_data_url_with_mime_parameters(tmp_path) -> None:
    """A MIME type carrying parameters (``;codecs=opus``) still saves as ``.webm``."""
    data_url = _data_url(b"voice", mime="audio/webm;codecs=opus")
    saved = save_base64_data_url(data_url, tmp_path)

    assert saved is not None
    assert saved.endswith(".webm")
    # Only the final path component is used to locate the file under tmp_path.
    saved_name = saved.split("/")[-1]
    assert (tmp_path / saved_name).read_bytes() == b"voice"
@pytest.mark.parametrize(
    ("mime", "suffix"),
    [
        ("audio/webm", ".webm"),
        ("video/webm", ".webm"),
        ("audio/ogg", ".ogg"),
        ("audio/wav", ".wav"),
        ("audio/mpga", ".mpga"),
    ],
)
def test_saves_common_audio_with_api_friendly_extension(
    tmp_path, mime: str, suffix: str
) -> None:
    """Each listed MIME type is saved with the paired file extension."""
    data_url = _data_url(b"voice", mime=mime)
    saved = save_base64_data_url(data_url, tmp_path)

    assert saved is not None
    assert saved.endswith(suffix)
def test_returns_none_for_malformed_data_url(tmp_path) -> None: def test_returns_none_for_malformed_data_url(tmp_path) -> None:
assert save_base64_data_url("not-a-data-url", tmp_path) is None assert save_base64_data_url("not-a-data-url", tmp_path) is None

View File

@ -18,6 +18,7 @@ from nanobot.webui.settings_api import (
update_agent_settings, update_agent_settings,
update_model_configuration, update_model_configuration,
update_network_safety_settings, update_network_safety_settings,
update_transcription_settings,
) )
@ -243,6 +244,75 @@ def test_settings_payload_includes_network_safety_fields(
assert payload["advanced"]["ssrf_whitelist_count"] == 1 assert payload["advanced"]["ssrf_whitelist_count"] == 1
def test_settings_payload_includes_effective_transcription_config(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Channel-level transcription provider/language show up in the payload's
    ``transcription`` section."""
    cfg_file = tmp_path / "config.json"
    cfg = Config()
    cfg.channels.transcription_provider = "openai"
    cfg.channels.transcription_language = "en"
    cfg.providers.openai.api_key = "sk-test"
    save_config(cfg, cfg_file)
    # Point the loader at the temporary config written above.
    monkeypatch.setattr("nanobot.config.loader._current_config_path", cfg_file)

    transcription = settings_payload()["transcription"]

    assert transcription["enabled"] is True
    assert transcription["provider"] == "openai"
    assert transcription["provider_configured"] is True
    assert transcription["model"] == "whisper-1"
    assert transcription["language"] == "en"
def test_update_transcription_settings_writes_top_level_only(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An update changes ``config.transcription`` and leaves the channel-level
    provider/language as they were."""
    cfg_file = tmp_path / "config.json"
    cfg = Config()
    cfg.channels.transcription_provider = "openai"
    cfg.channels.transcription_language = "en"
    cfg.providers.groq.api_key = "gsk-test"
    save_config(cfg, cfg_file)
    monkeypatch.setattr("nanobot.config.loader._current_config_path", cfg_file)

    form = {
        "enabled": ["true"],
        "provider": ["groq"],
        "model": ["whisper-large-v3-turbo"],
        "language": ["ko"],
        "maxDurationSec": ["90"],
        "maxUploadMb": ["20"],
    }
    response = update_transcription_settings(form)
    reloaded = load_config(cfg_file)

    # Channel-level values written before the update are unchanged on disk.
    assert reloaded.channels.transcription_provider == "openai"
    assert reloaded.channels.transcription_language == "en"

    # The submitted form values are stored under the transcription section.
    stored = reloaded.transcription
    assert stored.enabled is True
    assert stored.provider == "groq"
    assert stored.model == "whisper-large-v3-turbo"
    assert stored.language == "ko"
    assert stored.max_duration_sec == 90
    assert stored.max_upload_mb == 20

    assert response["transcription"]["provider"] == "groq"
    assert response["transcription"]["provider_configured"] is True
def test_update_transcription_settings_validates_language(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Submitting ``en-US`` as the language raises ``WebUISettingsError``."""
    cfg_file = tmp_path / "config.json"
    save_config(Config(), cfg_file)
    monkeypatch.setattr("nanobot.config.loader._current_config_path", cfg_file)

    bad_form = {"language": ["en-US"]}
    with pytest.raises(WebUISettingsError, match="transcription language"):
        update_transcription_settings(bad_form)
def test_settings_payload_includes_token_usage_summary( def test_settings_payload_includes_token_usage_summary(
tmp_path, tmp_path,
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,

View File

@ -0,0 +1,129 @@
"""Tests for WebUI transcription envelopes carried over the gateway socket."""
from __future__ import annotations
import base64
from pathlib import Path
from typing import Any
import pytest
from nanobot.config.loader import save_config
from nanobot.config.schema import Config
from nanobot.webui.transcription_ws import webui_transcription_event
def _audio_data_url(payload: bytes = b"voice", mime: str = "audio/webm") -> str:
return f"data:{mime};base64,{base64.b64encode(payload).decode('ascii')}"
@pytest.mark.asyncio
async def test_webui_transcribe_audio_rejects_unconfigured_provider(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With no API key set for the chosen provider, the event is a
    ``not_configured`` transcription error naming that provider."""
    cfg_file = tmp_path / "config.json"
    cfg = Config()
    cfg.transcription.provider = "groq"
    save_config(cfg, cfg_file)
    monkeypatch.setattr("nanobot.config.loader._current_config_path", cfg_file)

    request = {
        "request_id": "voice-1",
        "data_url": _audio_data_url(),
    }
    event, payload = await webui_transcription_event(request)

    assert event == "transcription_error"
    expected_payload = {
        "request_id": "voice-1",
        "detail": "not_configured",
        "provider": "groq",
    }
    assert payload == expected_payload
@pytest.mark.asyncio
async def test_webui_transcribe_audio_rejects_unsupported_mime(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A ``text/plain`` data URL is answered with a ``mime`` transcription error."""
    cfg_file = tmp_path / "config.json"
    cfg = Config()
    cfg.transcription.provider = "groq"
    cfg.providers.groq.api_key = "gsk-test"
    save_config(cfg, cfg_file)
    monkeypatch.setattr("nanobot.config.loader._current_config_path", cfg_file)

    request = {
        "request_id": "voice-1",
        "data_url": _audio_data_url(mime="text/plain"),
    }
    event, payload = await webui_transcription_event(request)

    assert event == "transcription_error"
    assert payload["request_id"] == "voice-1"
    assert payload["detail"] == "mime"
@pytest.mark.asyncio
async def test_webui_transcribe_audio_rejects_oversized_audio(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Audio one byte over the 1 MB upload limit yields a ``size`` error."""
    cfg_file = tmp_path / "config.json"
    cfg = Config()
    cfg.transcription.provider = "groq"
    cfg.transcription.max_upload_mb = 1
    cfg.providers.groq.api_key = "gsk-test"
    save_config(cfg, cfg_file)
    monkeypatch.setattr("nanobot.config.loader._current_config_path", cfg_file)
    monkeypatch.setattr("nanobot.audio.transcription.get_media_dir", lambda _channel=None: tmp_path)

    # Exactly one byte more than 1 MiB of raw audio.
    too_big = b"x" * (1024 * 1024 + 1)
    request = {
        "request_id": "voice-1",
        "data_url": _audio_data_url(payload=too_big),
    }
    event, payload = await webui_transcription_event(request)

    assert event == "transcription_error"
    assert payload["request_id"] == "voice-1"
    assert payload["detail"] == "size"
@pytest.mark.asyncio
async def test_webui_transcribe_audio_returns_text_and_removes_temp_file(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A successful request returns the transcript, and the audio file handed to
    the transcriber no longer exists afterwards."""
    cfg_file = tmp_path / "config.json"
    media_dir = tmp_path / "media"
    media_dir.mkdir()
    cfg = Config()
    cfg.transcription.provider = "groq"
    cfg.providers.groq.api_key = "gsk-test"
    save_config(cfg, cfg_file)
    monkeypatch.setattr("nanobot.config.loader._current_config_path", cfg_file)
    monkeypatch.setattr(
        "nanobot.audio.transcription.get_media_dir",
        lambda _channel=None: media_dir,
    )

    seen_paths: list[Path] = []

    async def fake_transcribe_audio_file(path: str | Path, _resolved: Any) -> str:
        # The file must exist while transcription runs; remember it for later.
        audio_path = Path(path)
        assert audio_path.exists()
        seen_paths.append(audio_path)
        return "hello voice"

    monkeypatch.setattr(
        "nanobot.audio.transcription.transcribe_audio_file",
        fake_transcribe_audio_file,
    )

    request = {
        "request_id": "voice-1",
        "data_url": _audio_data_url(payload=b"webm voice", mime="audio/webm;codecs=opus"),
        "duration_ms": 1200,
    }
    event, payload = await webui_transcription_event(request)

    assert event == "transcription_result"
    assert payload == {"request_id": "voice-1", "text": "hello voice"}
    assert seen_paths
    assert not seen_paths[0].exists()

View File

@ -81,6 +81,7 @@ const SETTINGS_SECTION_KEYS: SettingsSectionKey[] = [
"appearance", "appearance",
"models", "models",
"image", "image",
"voice",
"browser", "browser",
"apps", "apps",
"skills", "skills",

View File

@ -1,8 +1,9 @@
import { Suspense, lazy, useCallback, useState } from "react"; import { Suspense, lazy, useCallback, useState, type ReactNode } from "react";
import { Check, Copy } from "lucide-react"; import { Check, Copy } from "lucide-react";
import { useTranslation } from "react-i18next"; import { useTranslation } from "react-i18next";
import { useThemeValue } from "@/hooks/useTheme"; import { useThemeValue } from "@/hooks/useTheme";
import { hasAnsi, parseAnsiSegments, stripAnsi } from "@/lib/ansi";
import { cn } from "@/lib/utils"; import { cn } from "@/lib/utils";
interface CodeBlockProps { interface CodeBlockProps {
@ -36,6 +37,10 @@ const CODE_FONT_STACK = [
"monospace", "monospace",
].join(", "); ].join(", ");
const ANSI_LANGUAGES = new Set(["ansi", "ansi-output"]);
const CODE_SURFACE_LIGHT = "#f4f4f5";
const CODE_SURFACE_DARK = "#27272a";
const LazyHighlightedCode = lazy(async () => { const LazyHighlightedCode = lazy(async () => {
const [ const [
{ default: SyntaxHighlighter }, { default: SyntaxHighlighter },
@ -74,7 +79,11 @@ const LazyHighlightedCode = lazy(async () => {
language={language || "text"} language={language || "text"}
style={transparentTheme} style={transparentTheme}
customStyle={{ customStyle={{
background: chrome === "none" ? "transparent" : undefined, background: chrome === "none"
? "transparent"
: isDark
? CODE_SURFACE_DARK
: CODE_SURFACE_LIGHT,
margin: 0, margin: 0,
padding: chrome === "none" ? "0.75rem 1rem" : "1rem", padding: chrome === "none" ? "0.75rem 1rem" : "1rem",
fontFamily: CODE_FONT_STACK, fontFamily: CODE_FONT_STACK,
@ -83,10 +92,10 @@ const LazyHighlightedCode = lazy(async () => {
tabSize: 2, tabSize: 2,
}} }}
codeTagProps={{ codeTagProps={{
style: chrome === "none" ? { style: {
background: "transparent", background: "transparent",
fontFamily: CODE_FONT_STACK, fontFamily: CODE_FONT_STACK,
} : undefined, },
}} }}
lineNumberStyle={{ lineNumberStyle={{
minWidth: "2.6em", minWidth: "2.6em",
@ -106,14 +115,32 @@ const LazyHighlightedCode = lazy(async () => {
}; };
}); });
function PlainCodeFallback({ function renderPlainText(value: string): ReactNode {
return value;
}
/** Render ANSI-coded text as one styled `<span>` per parsed segment. */
function renderAnsiText(value: string): ReactNode {
  const segments = parseAnsiSegments(value);
  return segments.map(({ style, text }, position) => (
    <span key={position} style={style}>
      {text}
    </span>
  ));
}
function CodeTextBlock({
code, code,
chrome, chrome,
showLineNumbers, showLineNumbers,
testId,
className,
renderText = renderPlainText,
}: { }: {
code: string; code: string;
chrome: "default" | "none"; chrome: "default" | "none";
showLineNumbers: boolean; showLineNumbers: boolean;
testId: string;
className?: string;
renderText?: (value: string) => ReactNode;
}) { }) {
const lines = code.split("\n"); const lines = code.split("\n");
return ( return (
@ -121,10 +148,11 @@ function PlainCodeFallback({
className={cn( className={cn(
"m-0 overflow-x-auto p-4 font-mono text-sm leading-[1.6] text-foreground/90", "m-0 overflow-x-auto p-4 font-mono text-sm leading-[1.6] text-foreground/90",
showLineNumbers ? "whitespace-pre" : "whitespace-pre-wrap", showLineNumbers ? "whitespace-pre" : "whitespace-pre-wrap",
chrome === "default" ? "bg-background" : "bg-transparent", chrome === "default" ? "bg-zinc-100 dark:bg-zinc-800" : "bg-transparent",
chrome === "none" && "p-3 text-[13px] leading-[1.55]", chrome === "none" && "p-3 text-[13px] leading-[1.55]",
className,
)} )}
data-testid="plain-code-fallback" data-testid={testId}
> >
<code className="text-inherit"> <code className="text-inherit">
{showLineNumbers ? ( {showLineNumbers ? (
@ -133,16 +161,21 @@ function PlainCodeFallback({
<span className="w-10 shrink-0 select-none pr-4 text-right text-muted-foreground/60"> <span className="w-10 shrink-0 select-none pr-4 text-right text-muted-foreground/60">
{index + 1} {index + 1}
</span> </span>
<span className="whitespace-pre">{line || " "}</span> <span className="whitespace-pre">{renderText(line || " ")}</span>
{index < lines.length - 1 ? "\n" : null} {index < lines.length - 1 ? "\n" : null}
</span> </span>
)) ))
) : code} ) : renderText(code)}
</code> </code>
</pre> </pre>
); );
} }
/**
 * Decide whether a code block should go through the ANSI renderer.
 *
 * @param language - Fence language tag, if any; compared trimmed and lower-cased.
 * @param code - Raw block content, checked with `hasAnsi` when the tag does not match.
 * @returns `true` when the tag is in `ANSI_LANGUAGES` or `hasAnsi(code)` is truthy.
 */
function shouldRenderAnsi(language: string | undefined, code: string): boolean {
  const tag = language?.trim().toLowerCase();
  if (tag && ANSI_LANGUAGES.has(tag)) {
    return true;
  }
  return Boolean(hasAnsi(code));
}
export function CodeBlock({ export function CodeBlock({
language, language,
code, code,
@ -156,19 +189,20 @@ export function CodeBlock({
const [copied, setCopied] = useState(false); const [copied, setCopied] = useState(false);
const isDark = useThemeValue() === "dark"; const isDark = useThemeValue() === "dark";
const hasChrome = chrome === "default"; const hasChrome = chrome === "default";
const renderAnsi = shouldRenderAnsi(language, code);
const onCopy = useCallback(() => { const onCopy = useCallback(() => {
if (!navigator.clipboard) return; if (!navigator.clipboard) return;
navigator.clipboard.writeText(code).then(() => { navigator.clipboard.writeText(renderAnsi ? stripAnsi(code) : code).then(() => {
setCopied(true); setCopied(true);
setTimeout(() => setCopied(false), 1_500); setTimeout(() => setCopied(false), 1_500);
}); });
}, [code]); }, [code, renderAnsi]);
return ( return (
<div <div
className={cn( className={cn(
"overflow-hidden", "not-prose overflow-hidden",
hasChrome && "rounded-lg border", hasChrome && "rounded-lg border",
hasChrome && (isDark ? "border-white/10" : "border-black/10"), hasChrome && (isDark ? "border-white/10" : "border-black/10"),
className, className,
@ -177,7 +211,7 @@ export function CodeBlock({
{hasChrome ? ( {hasChrome ? (
<div <div
className={cn( className={cn(
"flex items-center justify-between px-4 py-1.5 text-xs font-medium", "flex items-center justify-between px-4 pb-1.5 pt-2 text-xs font-medium",
isDark isDark
? "bg-zinc-800 text-zinc-300" ? "bg-zinc-800 text-zinc-300"
: "bg-zinc-100 text-zinc-600", : "bg-zinc-100 text-zinc-600",
@ -206,13 +240,22 @@ export function CodeBlock({
</button> </button>
</div> </div>
) : null} ) : null}
{highlight ? ( {renderAnsi ? (
<CodeTextBlock
code={code}
chrome={chrome}
showLineNumbers={showLineNumbers}
testId="ansi-code"
renderText={renderAnsiText}
/>
) : highlight ? (
<Suspense <Suspense
fallback={ fallback={
<PlainCodeFallback <CodeTextBlock
code={code} code={code}
chrome={chrome} chrome={chrome}
showLineNumbers={showLineNumbers} showLineNumbers={showLineNumbers}
testId="plain-code-fallback"
/> />
} }
> >
@ -226,10 +269,11 @@ export function CodeBlock({
/> />
</Suspense> </Suspense>
) : ( ) : (
<PlainCodeFallback <CodeTextBlock
code={code} code={code}
chrome={chrome} chrome={chrome}
showLineNumbers={showLineNumbers} showLineNumbers={showLineNumbers}
testId="plain-code-fallback"
/> />
)} )}
</div> </div>

View File

@ -31,6 +31,7 @@ import {
Layers, Layers,
Loader2, Loader2,
LogOut, LogOut,
Mic,
Moon, Moon,
PlayCircle, PlayCircle,
Plus, Plus,
@ -92,6 +93,7 @@ import {
updateNetworkSafetySettings, updateNetworkSafetySettings,
updateProviderSettings, updateProviderSettings,
updateSettings, updateSettings,
updateTranscriptionSettings,
updateWebSearchSettings, updateWebSearchSettings,
} from "@/lib/api"; } from "@/lib/api";
import { notifyCliAppsChanged } from "@/lib/cli-app-events"; import { notifyCliAppsChanged } from "@/lib/cli-app-events";
@ -115,6 +117,7 @@ import type {
ProviderModelsPayload, ProviderModelsPayload,
SettingsPayload, SettingsPayload,
SkillSummary, SkillSummary,
TranscriptionSettingsUpdate,
WebSearchSettingsUpdate, WebSearchSettingsUpdate,
WebuiDefaultAccessMode, WebuiDefaultAccessMode,
} from "@/lib/types"; } from "@/lib/types";
@ -124,6 +127,7 @@ export type SettingsSectionKey =
| "appearance" | "appearance"
| "models" | "models"
| "image" | "image"
| "voice"
| "browser" | "browser"
| "apps" | "apps"
| "skills" | "skills"
@ -367,6 +371,26 @@ const DEFAULT_IMAGE_GENERATION_FORM: ImageGenerationSettingsUpdate = {
maxImagesPerTurn: 4, maxImagesPerTurn: 4,
}; };
/**
 * Transcription form state used to seed the settings form when no initial
 * settings payload is available.
 */
const DEFAULT_TRANSCRIPTION_FORM: TranscriptionSettingsUpdate = {
enabled: true,
provider: "groq",
model: "",
language: "",
maxDurationSec: 120,
maxUploadMb: 25,
};
/**
 * Stand-in for `settings.transcription` when a settings payload does not
 * include a transcription section.
 */
const DEFAULT_TRANSCRIPTION_SETTINGS: NonNullable<SettingsPayload["transcription"]> = {
enabled: true,
provider: "groq",
provider_configured: false,
model: "whisper-large-v3",
language: null,
max_duration_sec: 120,
max_upload_mb: 25,
providers: [],
};
const DEFAULT_NETWORK_SAFETY_FORM: NetworkSafetySettingsUpdate = { const DEFAULT_NETWORK_SAFETY_FORM: NetworkSafetySettingsUpdate = {
webuiAllowLocalServiceAccess: true, webuiAllowLocalServiceAccess: true,
webuiDefaultAccessMode: "default", webuiDefaultAccessMode: "default",
@ -419,6 +443,18 @@ function imageGenerationFormFromPayload(payload: SettingsPayload): ImageGenerati
}; };
} }
/**
 * Map the payload's transcription section onto the editable form shape.
 *
 * Falls back to `DEFAULT_TRANSCRIPTION_SETTINGS` when the payload has no
 * transcription section.
 */
function transcriptionFormFromPayload(payload: SettingsPayload): TranscriptionSettingsUpdate {
  const {
    enabled,
    provider,
    model,
    language,
    max_duration_sec: maxDurationSec,
    max_upload_mb: maxUploadMb,
  } = payload.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;

  // The form holds an empty string where the payload has no language.
  return {
    enabled,
    provider,
    model,
    language: language ?? "",
    maxDurationSec,
    maxUploadMb,
  };
}
function networkSafetyFormFromPayload(payload: SettingsPayload): NetworkSafetySettingsUpdate { function networkSafetyFormFromPayload(payload: SettingsPayload): NetworkSafetySettingsUpdate {
return { return {
webuiAllowLocalServiceAccess: webuiAllowLocalServiceAccess:
@ -479,6 +515,7 @@ export function SettingsView({
const [providerSaving, setProviderSaving] = useState<string | null>(null); const [providerSaving, setProviderSaving] = useState<string | null>(null);
const [webSearchSaving, setWebSearchSaving] = useState(false); const [webSearchSaving, setWebSearchSaving] = useState(false);
const [imageGenerationSaving, setImageGenerationSaving] = useState(false); const [imageGenerationSaving, setImageGenerationSaving] = useState(false);
const [transcriptionSaving, setTranscriptionSaving] = useState(false);
const [networkSafetySaving, setNetworkSafetySaving] = useState(false); const [networkSafetySaving, setNetworkSafetySaving] = useState(false);
const [hostEngineApplying, setHostEngineApplying] = useState(false); const [hostEngineApplying, setHostEngineApplying] = useState(false);
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
@ -511,6 +548,9 @@ export function SettingsView({
? imageGenerationFormFromPayload(initialSettings) ? imageGenerationFormFromPayload(initialSettings)
: DEFAULT_IMAGE_GENERATION_FORM, : DEFAULT_IMAGE_GENERATION_FORM,
); );
const [transcriptionForm, setTranscriptionForm] = useState<TranscriptionSettingsUpdate>(
() => initialSettings ? transcriptionFormFromPayload(initialSettings) : DEFAULT_TRANSCRIPTION_FORM,
);
const [networkSafetyForm, setNetworkSafetyForm] = useState<NetworkSafetySettingsUpdate>(() => const [networkSafetyForm, setNetworkSafetyForm] = useState<NetworkSafetySettingsUpdate>(() =>
initialSettings ? networkSafetyFormFromPayload(initialSettings) : DEFAULT_NETWORK_SAFETY_FORM, initialSettings ? networkSafetyFormFromPayload(initialSettings) : DEFAULT_NETWORK_SAFETY_FORM,
); );
@ -543,6 +583,7 @@ export function SettingsView({
setForm(agentDraftFromPayload(payload)); setForm(agentDraftFromPayload(payload));
setWebSearchForm((prev) => webSearchFormFromPayload(payload, prev)); setWebSearchForm((prev) => webSearchFormFromPayload(payload, prev));
setImageGenerationForm(imageGenerationFormFromPayload(payload)); setImageGenerationForm(imageGenerationFormFromPayload(payload));
setTranscriptionForm(transcriptionFormFromPayload(payload));
setNetworkSafetyForm(networkSafetyFormFromPayload(payload)); setNetworkSafetyForm(networkSafetyFormFromPayload(payload));
if (payload.restart_required_sections) { if (payload.restart_required_sections) {
setPendingRestartSections(pendingRestartSectionsFromPayload(payload)); setPendingRestartSections(pendingRestartSectionsFromPayload(payload));
@ -711,6 +752,19 @@ export function SettingsView({
); );
}, [imageGenerationForm, settings]); }, [imageGenerationForm, settings]);
// True when any transcription form field differs from the loaded settings.
const transcriptionDirty = useMemo(() => {
  if (!settings) return false;
  const saved = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
  const matchesSaved =
    transcriptionForm.enabled === saved.enabled &&
    transcriptionForm.provider === saved.provider &&
    transcriptionForm.model === saved.model &&
    // A missing saved language is compared as the empty string the form uses.
    transcriptionForm.language === (saved.language ?? "") &&
    transcriptionForm.maxDurationSec === saved.max_duration_sec &&
    transcriptionForm.maxUploadMb === saved.max_upload_mb;
  return !matchesSaved;
}, [settings, transcriptionForm]);
const networkSafetyDirty = useMemo(() => { const networkSafetyDirty = useMemo(() => {
if (!settings) return false; if (!settings) return false;
const currentLocalServiceAccess = const currentLocalServiceAccess =
@ -913,6 +967,24 @@ export function SettingsView({
} }
}; };
// Submit the transcription form and apply the payload that comes back.
// Sets the shared error message on failure and clears it on success.
const saveTranscriptionSettings = async () => {
// Nothing to do without loaded settings, without changes, or mid-save.
if (!settings || !transcriptionDirty || transcriptionSaving) return;
setTranscriptionSaving(true);
try {
const payload = await updateTranscriptionSettings(token, transcriptionForm);
applyPayload(payload);
if (payload.requires_restart) {
// NOTE(review): the pending restart is recorded under the `browser` key,
// not a voice-specific one — confirm this sharing is intentional.
setPendingRestartSections((prev) => ({ ...prev, browser: true }));
}
await maybeRestartHostEngine(payload);
setError(null);
} catch (err) {
setError((err as Error).message);
} finally {
// Always release the saving flag, whether the request succeeded or failed.
setTranscriptionSaving(false);
}
};
const saveNetworkSafetySettings = async () => { const saveNetworkSafetySettings = async () => {
if (!settings || !networkSafetyDirty || networkSafetySaving) return; if (!settings || !networkSafetyDirty || networkSafetySaving) return;
setNetworkSafetySaving(true); setNetworkSafetySaving(true);
@ -1333,6 +1405,22 @@ export function SettingsView({
requiresRestartPending={pendingRestartSections.image} requiresRestartPending={pendingRestartSections.image}
/> />
); );
case "voice":
return (
<TranscriptionSettings
settings={settings}
form={transcriptionForm}
dirty={transcriptionDirty}
saving={transcriptionSaving}
onChangeForm={setTranscriptionForm}
onSave={saveTranscriptionSettings}
onOpenProviders={() => selectSection("models")}
showBrandLogos={localPrefs.brandLogos}
onRestart={restartViaSettingsSurface}
isRestarting={isRestarting || hostEngineApplying}
requiresRestartPending={pendingRestartSections.browser}
/>
);
case "browser": case "browser":
return ( return (
<WebSettings <WebSettings
@ -1523,6 +1611,7 @@ const SETTINGS_NAV_ITEMS: Array<{ key: SettingsSectionKey; icon: LucideIcon; fal
{ key: "appearance", icon: Palette, fallback: "Appearance" }, { key: "appearance", icon: Palette, fallback: "Appearance" },
{ key: "models", icon: SlidersHorizontal, fallback: "Models" }, { key: "models", icon: SlidersHorizontal, fallback: "Models" },
{ key: "image", icon: ImageIcon, fallback: "Image" }, { key: "image", icon: ImageIcon, fallback: "Image" },
{ key: "voice", icon: Mic, fallback: "Voice" },
{ key: "browser", icon: Globe2, fallback: "Web" }, { key: "browser", icon: Globe2, fallback: "Web" },
{ key: "runtime", icon: Server, fallback: "System" }, { key: "runtime", icon: Server, fallback: "System" },
{ key: "advanced", icon: ShieldCheck, fallback: "Security" }, { key: "advanced", icon: ShieldCheck, fallback: "Security" },
@ -1642,6 +1731,24 @@ function OverviewSettings({
const webStatus = settings.web.enable const webStatus = settings.web.enable
? tx("settings.values.enabled", "Enabled") ? tx("settings.values.enabled", "Enabled")
: tx("settings.values.disabled", "Disabled"); : tx("settings.values.disabled", "Disabled");
const webSearchProvider =
settings.web_search.providers.find((provider) => provider.name === settings.web_search.provider) ??
settings.web_search.providers[0];
const webSearchProviderLabel = providerDisplayLabel(
settings.web_search.providers,
settings.web_search.provider,
);
const webSearchCredentialStatus =
webSearchProvider?.credential === "none"
? tx("settings.byok.webSearch.noCredentialRequired", "No key required")
: webSearchProvider?.credential === "base_url"
? settings.web_search.base_url
? tx("settings.values.configured", "Configured")
: tx("settings.values.notConfigured", "Not configured")
: settings.web_search.api_key_hint
? tx("settings.values.configured", "Configured")
: tx("settings.values.notConfigured", "Not configured");
const webCaption = `${webSearchProviderLabel} · ${webSearchCredentialStatus}`;
const imageStatus = settings.image_generation.enabled const imageStatus = settings.image_generation.enabled
? tx("settings.values.enabled", "Enabled") ? tx("settings.values.enabled", "Enabled")
: tx("settings.values.disabled", "Disabled"); : tx("settings.values.disabled", "Disabled");
@ -1650,6 +1757,15 @@ function OverviewSettings({
? tx("settings.values.configured", "Configured") ? tx("settings.values.configured", "Configured")
: tx("settings.values.notConfigured", "Not configured") : tx("settings.values.notConfigured", "Not configured")
}`; }`;
const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
const voiceStatus = transcription.enabled
? tx("settings.values.enabled", "Enabled")
: tx("settings.values.disabled", "Disabled");
const voiceCaption = `${providerDisplayLabel(transcription.providers, transcription.provider)} · ${
transcription.provider_configured
? tx("settings.values.configured", "Configured")
: tx("settings.values.notConfigured", "Not configured")
}`;
const isNativeHost = (settings.surface ?? settings.runtime_surface) === "native"; const isNativeHost = (settings.surface ?? settings.runtime_surface) === "native";
const workspaceCaption = shortWorkspacePath(settings.runtime.workspace_path); const workspaceCaption = shortWorkspacePath(settings.runtime.workspace_path);
const runtimeTitle = isNativeHost const runtimeTitle = isNativeHost
@ -1691,8 +1807,8 @@ function OverviewSettings({
icon={Globe2} icon={Globe2}
valueLogoProvider={settings.web_search.provider} valueLogoProvider={settings.web_search.provider}
title={tx("settings.overview.webSearch", "Web search")} title={tx("settings.overview.webSearch", "Web search")}
value={providerDisplayLabel(settings.web_search.providers, settings.web_search.provider)} value={webStatus}
caption={webStatus} caption={webCaption}
showBrandLogos={showBrandLogos} showBrandLogos={showBrandLogos}
onClick={() => onSelectSection("browser")} onClick={() => onSelectSection("browser")}
/> />
@ -1705,6 +1821,15 @@ function OverviewSettings({
showBrandLogos={showBrandLogos} showBrandLogos={showBrandLogos}
onClick={() => onSelectSection("image")} onClick={() => onSelectSection("image")}
/> />
<OverviewListRow
icon={Mic}
valueLogoProvider={transcription.provider}
title={tx("settings.overview.voiceInput", "Voice input")}
value={voiceStatus}
caption={voiceCaption}
showBrandLogos={showBrandLogos}
onClick={() => onSelectSection("voice")}
/>
</SettingsGroup> </SettingsGroup>
</section> </section>
@ -2654,6 +2779,137 @@ function ImageGenerationSettings({
); );
} }
/**
 * Settings panel for voice input / transcription.
 *
 * Renders, in order: an on/off toggle, a provider picker, a read-only
 * provider status pill (plus a "Configure provider" button when the provider
 * is not configured), a model id input, a language hint input, duration and
 * upload-size limits, and a save/restart footer.
 *
 * The component is controlled: every edit is written through `onChangeForm`
 * as a functional update of the `form` object, and nothing is persisted until
 * `onSave` is invoked from the footer.
 *
 * @param settings Full settings payload; only `settings.transcription` is read.
 * @param form Current (possibly unsaved) transcription form values.
 * @param dirty Forwarded to the footer as `dirty`.
 * @param saving Forwarded to the footer as `saving`.
 * @param onChangeForm State setter for `form`.
 * @param onSave Forwarded to the footer as `onSave`.
 * @param onOpenProviders Click handler of the "Configure provider" button.
 * @param showBrandLogos Forwarded to the provider picker as `showProviderLogos`.
 * @param onRestart Forwarded to the footer as `onRestart`.
 * @param isRestarting Forwarded to the footer as `isRestarting`.
 * @param requiresRestartPending Forwarded to the footer as `pendingRestart`.
 */
function TranscriptionSettings({
settings,
form,
dirty,
saving,
onChangeForm,
onSave,
onOpenProviders,
showBrandLogos,
onRestart,
isRestarting,
requiresRestartPending,
}: {
settings: SettingsPayload;
form: TranscriptionSettingsUpdate;
dirty: boolean;
saving: boolean;
onChangeForm: Dispatch<SetStateAction<TranscriptionSettingsUpdate>>;
onSave: () => void;
onOpenProviders: () => void;
showBrandLogos: boolean;
onRestart?: () => void;
isRestarting?: boolean;
requiresRestartPending: boolean;
}) {
const { t } = useTranslation();
// Translate `key`, falling back to the inline English text when the key is missing.
const tx = (key: string, fallback: string) => t(key, { defaultValue: fallback });
// Payloads without a `transcription` section fall back to the built-in defaults.
const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
// Provider entry matching the form selection, or the first listed provider when
// nothing matches.
// NOTE(review): with an unknown or empty `form.provider` the status pill below
// reflects the first provider rather than the picker value — confirm this is intended.
const selectedProvider =
transcription.providers.find((provider) => provider.name === form.provider) ??
transcription.providers[0];
// False when the provider list is empty (selectedProvider is undefined).
const providerConfigured = !!selectedProvider?.configured;
return (
<section>
<SettingsSectionTitle>{tx("settings.sections.voiceInput", "Voice input")}</SettingsSectionTitle>
<SettingsGroup>
<SettingsRow
title={tx("settings.rows.transcription", "Transcription")}
description={tx("settings.help.transcription", "Transcribe microphone input before sending it. Chat channel voice messages use the same settings.")}
>
<ToggleButton
checked={form.enabled}
onChange={(enabled) => onChangeForm((prev) => ({ ...prev, enabled }))}
ariaLabel={tx("settings.rows.transcription", "Transcription")}
label={form.enabled ? tx("settings.values.on", "On") : tx("settings.values.off", "Off")}
/>
</SettingsRow>
<SettingsRow
title={tx("settings.rows.transcriptionProvider", "Provider")}
description={tx("settings.help.transcriptionProvider", "Uses the matching provider credentials from Providers.")}
>
<ProviderPicker
providers={transcription.providers}
value={form.provider}
emptyLabel={tx("settings.voice.selectProvider", "Select provider")}
showProviderLogos={showBrandLogos}
onChange={(provider) => onChangeForm((prev) => ({ ...prev, provider }))}
/>
</SettingsRow>
<SettingsRow
title={tx("settings.rows.transcriptionProviderStatus", "Provider status")}
description={tx("settings.help.transcriptionProviderStatus", "API keys stay under providers, not in transcription settings.")}
>
<div className="flex flex-wrap items-center justify-end gap-2">
<StatusPill tone={providerConfigured ? "success" : "neutral"}>
{providerConfigured
? tx("settings.values.configured", "Configured")
: tx("settings.values.notConfigured", "Not configured")}
</StatusPill>
{!providerConfigured ? (
<Button size="sm" variant="outline" onClick={onOpenProviders} className="rounded-full">
{tx("settings.voice.configureProvider", "Configure provider")}
</Button>
) : null}
</div>
</SettingsRow>
<SettingsRow
title={tx("settings.rows.transcriptionModel", "Model")}
description={tx("settings.help.transcriptionModel", "Leave as the resolved default unless your provider needs a custom model id.")}
>
<Input
value={form.model}
onChange={(event) => onChangeForm((prev) => ({ ...prev, model: event.target.value }))}
className="h-8 w-[min(300px,70vw)] rounded-full text-[13px]"
/>
</SettingsRow>
<SettingsRow
title={tx("settings.rows.transcriptionLanguage", "Language")}
description={tx("settings.help.transcriptionLanguage", "Optional ISO-639 hint such as en, zh, ja, or ko.")}
>
<Input
value={form.language}
onChange={(event) => onChangeForm((prev) => ({ ...prev, language: event.target.value }))}
placeholder={tx("settings.voice.languageAuto", "Auto")}
className="h-8 w-[min(180px,60vw)] rounded-full text-[13px]"
/>
</SettingsRow>
<SettingsRow title={tx("settings.rows.voiceLimits", "Limits")}>
<div className="flex flex-wrap justify-end gap-2">
<NumberInput
value={form.maxDurationSec}
min={1}
max={600}
suffix="s"
onChange={(maxDurationSec) => onChangeForm((prev) => ({ ...prev, maxDurationSec }))}
/>
<NumberInput
value={form.maxUploadMb}
min={1}
max={100}
suffix="MB"
onChange={(maxUploadMb) => onChangeForm((prev) => ({ ...prev, maxUploadMb }))}
/>
</div>
</SettingsRow>
<RestartSettingsFooter
dirty={dirty}
saving={saving}
pendingRestart={requiresRestartPending}
dirtyMessage={tx("settings.status.restartAfterSaving", "Save changes, then restart when ready.")}
pendingMessage={tx("settings.status.savedRestartApply", "Saved. Restart when ready.")}
onSave={onSave}
onRestart={onRestart}
isRestarting={isRestarting}
/>
</SettingsGroup>
</section>
);
}
function WebSettings({ function WebSettings({
settings, settings,
form, form,

View File

@ -78,16 +78,13 @@ function buildTokenUsageCalendar(
const today = utcDateFromIsoDay(isoDayInTimeZone(new Date(), timeZone)); const today = utcDateFromIsoDay(isoDayInTimeZone(new Date(), timeZone));
const end = addUtcDays(today, 6 - today.getUTCDay()); const end = addUtcDays(today, 6 - today.getUTCDay());
const start = addUtcDays(end, -(TOKEN_HEATMAP_CELLS - 1)); const start = addUtcDays(end, -(TOKEN_HEATMAP_CELLS - 1));
const seenMonths = new Set<string>();
const monthLabels: TokenUsageMonthLabel[] = []; const monthLabels: TokenUsageMonthLabel[] = [];
const cells = Array.from({ length: TOKEN_HEATMAP_CELLS }, (_, index) => { const cells = Array.from({ length: TOKEN_HEATMAP_CELLS }, (_, index) => {
const date = addUtcDays(start, index); const date = addUtcDays(start, index);
const key = isoDay(date); const key = isoDay(date);
const row = byDate.get(key); const row = byDate.get(key);
const monthKey = key.slice(0, 7); if (date.getUTCDate() === 1) {
if (!seenMonths.has(monthKey)) {
seenMonths.add(monthKey);
monthLabels.push({ monthLabels.push({
label: monthFormatter.format(date), label: monthFormatter.format(date),
column: Math.floor(index / 7) + 1, column: Math.floor(index / 7) + 1,
@ -186,16 +183,12 @@ export function TokenUsageHeatmap({
{tx("settings.usage.shortTitle", "Token Usage")} {tx("settings.usage.shortTitle", "Token Usage")}
</span> </span>
</div> </div>
<div <div className="relative mb-2 h-4 text-[10px] font-normal leading-4 text-muted-foreground/62" aria-hidden>
className="mb-2 grid min-h-4 gap-1.5 text-[10px] font-normal leading-4 text-muted-foreground/62"
style={{ gridTemplateColumns: `repeat(${TOKEN_HEATMAP_COLUMNS}, minmax(0, 1fr))` }}
aria-hidden
>
{monthLabels.map((month) => ( {monthLabels.map((month) => (
<span <span
key={`${month.label}-${month.column}`} key={`${month.label}-${month.column}`}
className="whitespace-nowrap" className="absolute top-0 whitespace-nowrap"
style={{ gridColumnStart: month.column, gridColumnEnd: "span 4" }} style={{ left: `${((month.column - 1) / TOKEN_HEATMAP_COLUMNS) * 100}%` }}
> >
{month.label} {month.label}
</span> </span>

View File

@ -31,6 +31,7 @@ import {
History, History,
ImageIcon, ImageIcon,
Loader2, Loader2,
Mic,
Plus, Plus,
RotateCw, RotateCw,
Shield, Shield,
@ -46,6 +47,12 @@ import {
import { useTranslation } from "react-i18next"; import { useTranslation } from "react-i18next";
import { Button } from "@/components/ui/button"; import { Button } from "@/components/ui/button";
import {
Tooltip,
TooltipContent,
TooltipProvider,
TooltipTrigger,
} from "@/components/ui/tooltip";
import { import {
WorkspaceAccessMenu, WorkspaceAccessMenu,
WorkspaceProjectPicker, WorkspaceProjectPicker,
@ -59,6 +66,7 @@ import {
} from "@/hooks/useAttachedImages"; } from "@/hooks/useAttachedImages";
import { useClipboardAndDrop } from "@/hooks/useClipboardAndDrop"; import { useClipboardAndDrop } from "@/hooks/useClipboardAndDrop";
import type { SendImage, SendOptions } from "@/hooks/useNanobotStream"; import type { SendImage, SendOptions } from "@/hooks/useNanobotStream";
import { useVoiceRecorder, type VoiceRecorderErrorKey } from "@/hooks/useVoiceRecorder";
import type { import type {
CliAppInfo, CliAppInfo,
GoalStateWsPayload, GoalStateWsPayload,
@ -79,6 +87,9 @@ import { cn } from "@/lib/utils";
/** ``<input accept>``: aligned with the server's MIME whitelist. SVG is /** ``<input accept>``: aligned with the server's MIME whitelist. SVG is
* deliberately excluded to avoid an embedded-script XSS surface. */ * deliberately excluded to avoid an embedded-script XSS surface. */
const ACCEPT_ATTR = "image/png,image/jpeg,image/webp,image/gif"; const ACCEPT_ATTR = "image/png,image/jpeg,image/webp,image/gif";
const VOICE_SHORTCUT_CODE = "KeyD";
const VOICE_SHORTCUT_ARIA = "Control+Shift+D";
type VoiceShortcutPlatform = "apple" | "chromeos" | "linux" | "other" | "windows";
function formatBytes(n: number): string { function formatBytes(n: number): string {
if (n < 1024) return `${n} B`; if (n < 1024) return `${n} B`;
@ -86,6 +97,54 @@ function formatBytes(n: number): string {
return `${(n / (1024 * 1024)).toFixed(1)} MB`; return `${(n / (1024 * 1024)).toFixed(1)} MB`;
} }
/**
 * Reports whether a keyboard event is the voice-input chord: the shortcut key
 * (matched by physical `code`) with Control and Shift held and neither Alt nor
 * Meta held.
 */
function isVoiceShortcutDown(event: KeyboardEvent): boolean {
  if (event.code !== VOICE_SHORTCUT_CODE) return false;
  const requiredModifiersHeld = event.ctrlKey && event.shiftKey;
  const extraModifierHeld = event.altKey || event.metaKey;
  return requiredModifiersHeld && !extraModifierHeld;
}
/**
 * Reports whether a key-up event releases any part of the voice-input chord:
 * the shortcut key itself (matched by `code`), or the Control or Shift key.
 */
function isVoiceShortcutRelease(event: KeyboardEvent): boolean {
  if (event.code === VOICE_SHORTCUT_CODE) return true;
  const releasedKey = event.key;
  return releasedKey === "Control" || releasedKey === "Shift";
}
/**
 * Classifies the current platform for shortcut-label purposes.
 *
 * The User-Agent Client Hints platform (when exposed), `navigator.platform`
 * and `navigator.userAgent` are joined into one lowercase string that is then
 * matched against platform markers in a fixed order: Apple, Windows, ChromeOS,
 * Linux/Android. Returns "other" when `navigator` is not defined or nothing
 * matches.
 */
function getVoiceShortcutPlatform(): VoiceShortcutPlatform {
  if (typeof navigator === "undefined") return "other";

  const hintedNavigator = navigator as Navigator & {
    userAgentData?: { platform?: string };
  };
  const fingerprint = [
    hintedNavigator.userAgentData?.platform,
    navigator.platform,
    navigator.userAgent,
  ]
    .filter(Boolean)
    .join(" ")
    .toLowerCase();
  const mentions = (...markers: string[]) =>
    markers.some((marker) => fingerprint.includes(marker));

  // A "MacIntel" platform with more than one touch point is treated as an iPad.
  const touchMac = navigator.platform === "MacIntel" && navigator.maxTouchPoints > 1;
  if (touchMac || mentions("mac", "iphone", "ipad", "ipod")) return "apple";
  if (mentions("win")) return "windows";
  if (mentions("cros")) return "chromeos";
  if (mentions("linux", "x11", "android")) return "linux";
  return "other";
}
/**
 * Returns the human-readable voice shortcut label for the current platform:
 * modifier glyphs on Apple platforms, a spelled-out "Ctrl" form everywhere else.
 */
function getVoiceShortcutLabel(): string {
  const platform = getVoiceShortcutPlatform();
  return platform === "apple" ? "⌃⇧D" : "Ctrl ⇧ D";
}
interface ThreadComposerProps { interface ThreadComposerProps {
onSend: (content: string, images?: SendImage[], options?: SendOptions) => void; onSend: (content: string, images?: SendImage[], options?: SendOptions) => void;
disabled?: boolean; disabled?: boolean;
@ -101,6 +160,7 @@ interface ThreadComposerProps {
cliApps?: CliAppInfo[]; cliApps?: CliAppInfo[];
mcpPresets?: McpPresetInfo[]; mcpPresets?: McpPresetInfo[];
onStop?: () => void; onStop?: () => void;
onTranscribeAudio?: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
/** Unix seconds from server; turn elapsed timer above input while set. */ /** Unix seconds from server; turn elapsed timer above input while set. */
runStartedAt?: number | null; runStartedAt?: number | null;
/** Sustained objective for this chat (WebSocket ``goal_state``). */ /** Sustained objective for this chat (WebSocket ``goal_state``). */
@ -138,6 +198,45 @@ const QUEUED_PROMPTS_STORAGE_PREFIX = "nanobot.webui.composerQueuedGuidance.v1:"
const QUEUED_PROMPTS_LIMIT = 20; const QUEUED_PROMPTS_LIMIT = 20;
const QUEUED_PROMPT_MAX_CHARS = 4000; const QUEUED_PROMPT_MAX_CHARS = 4000;
/**
 * Inline recording indicator: a row of vertical bars (one per entry in
 * `levels`, each entry used as the bar height) followed by the elapsed-time
 * label. The bars are hidden from assistive technology; the container exposes
 * `ariaLabel` through a polite live region instead.
 */
function VoiceRecordingMeter(props: {
  ariaLabel: string;
  className?: string;
  elapsedLabel: string;
  isHero: boolean;
  levels: number[];
}) {
  const { ariaLabel, className, elapsedLabel, isHero, levels } = props;

  const containerClasses = cn(
    "flex min-w-0 items-center gap-2 text-neutral-700 dark:text-white",
    isHero ? "h-8" : "h-9",
    className,
  );

  // Bars are keyed by position: the list has a fixed length and only heights change.
  const bars = levels.map((barHeight, barIndex) => (
    <span
      key={barIndex}
      className="w-[2px] rounded-full bg-current opacity-85 transition-[height] duration-75 ease-linear motion-reduce:transition-none"
      style={{ height: barHeight }}
    />
  ));

  return (
    <div className={containerClasses} aria-live="polite" aria-label={ariaLabel}>
      <span className="flex h-5 min-w-0 flex-1 items-center justify-between overflow-hidden" aria-hidden>
        {bars}
      </span>
      <span className="min-w-[2.1rem] text-right text-[12px] font-medium tabular-nums text-muted-foreground">
        {elapsedLabel}
      </span>
    </div>
  );
}
type SlashPalettePlacement = "above" | "below"; type SlashPalettePlacement = "above" | "below";
interface SlashPaletteLayout { interface SlashPaletteLayout {
@ -656,6 +755,7 @@ export function ThreadComposer({
cliApps = [], cliApps = [],
mcpPresets = [], mcpPresets = [],
onStop, onStop,
onTranscribeAudio,
runStartedAt = null, runStartedAt = null,
goalState, goalState,
workspaceScope = null, workspaceScope = null,
@ -685,7 +785,9 @@ export function ThreadComposer({
const wasStreamingRef = useRef(isStreaming); const wasStreamingRef = useRef(isStreaming);
const skipNextQueuedFlushRef = useRef(false); const skipNextQueuedFlushRef = useRef(false);
const skipQueuedPromptPersistRef = useRef(false); const skipQueuedPromptPersistRef = useRef(false);
const voiceShortcutDownRef = useRef(false);
const isHero = variant === "hero"; const isHero = variant === "hero";
const voiceShortcutLabel = useMemo(getVoiceShortcutLabel, []);
const queuedPromptStorageKey = useMemo( const queuedPromptStorageKey = useMemo(
() => queuedPromptsStorageKey(pendingQueueKey), () => queuedPromptsStorageKey(pendingQueueKey),
[pendingQueueKey], [pendingQueueKey],
@ -1026,6 +1128,65 @@ export function ThreadComposer({
}); });
}, []); }, []);
const appendTranscription = useCallback((text: string) => {
const transcript = text.trim();
if (!transcript) return;
setValue((current) => {
if (!current.trim()) return transcript;
const separator = /[\s\n]$/.test(current) ? "" : " ";
return `${current}${separator}${transcript}`;
});
setSlashMenuDismissed(false);
setCliAppMenuDismissed(false);
setInlineError(null);
resizeTextarea();
}, [resizeTextarea]);
const clearInlineError = useCallback(() => setInlineError(null), []);
const setVoiceError = useCallback((key: VoiceRecorderErrorKey) => {
setInlineError(t(`thread.composer.voiceErrors.${key}`));
}, [t]);
const voiceRecorder = useVoiceRecorder({
disabled,
onClearError: clearInlineError,
onError: setVoiceError,
onTranscript: appendTranscription,
onTranscribeAudio,
});
useEffect(() => {
if (!onTranscribeAudio) return;
function onKeyDown(event: KeyboardEvent): void {
if (!isVoiceShortcutDown(event) || event.repeat || voiceShortcutDownRef.current) return;
event.preventDefault();
voiceShortcutDownRef.current = true;
voiceRecorder.beginShortcutHold();
}
function onKeyUp(event: KeyboardEvent): void {
if (!voiceShortcutDownRef.current || !isVoiceShortcutRelease(event)) return;
event.preventDefault();
voiceShortcutDownRef.current = false;
voiceRecorder.endShortcutHold();
}
function onWindowBlur(): void {
if (!voiceShortcutDownRef.current) return;
voiceShortcutDownRef.current = false;
voiceRecorder.endShortcutHold();
}
window.addEventListener("keydown", onKeyDown);
window.addEventListener("keyup", onKeyUp);
window.addEventListener("blur", onWindowBlur);
return () => {
window.removeEventListener("keydown", onKeyDown);
window.removeEventListener("keyup", onKeyUp);
window.removeEventListener("blur", onWindowBlur);
};
}, [onTranscribeAudio, voiceRecorder.beginShortcutHold, voiceRecorder.endShortcutHold]);
const chooseSlashCommand = useCallback( const chooseSlashCommand = useCallback(
(command: SlashCommand) => { (command: SlashCommand) => {
if (command.command === "/stop" && isStreaming && onStop) { if (command.command === "/stop" && isStreaming && onStop) {
@ -1341,6 +1502,23 @@ export function ThreadComposer({
); );
const attachButtonDisabled = disabled || full; const attachButtonDisabled = disabled || full;
const showVoiceButton = Boolean(onTranscribeAudio);
const voiceRecordingStatusLabel = t("thread.composer.voice.recordingStatus", {
time: voiceRecorder.elapsedLabel,
defaultValue: `Recording ${voiceRecorder.elapsedLabel}`,
});
const voiceButtonLabel =
voiceRecorder.state === "recording"
? t("thread.composer.voice.stop")
: voiceRecorder.state === "transcribing"
? t("thread.composer.voice.transcribing")
: t("thread.composer.tools.voice");
const voiceButtonTooltip =
voiceRecorder.state === "recording"
? t("thread.composer.voice.stop")
: voiceRecorder.state === "transcribing"
? t("thread.composer.voice.transcribing")
: t("thread.composer.voice.hint");
const showStopButton = isStreaming && !!onStop; const showStopButton = isStreaming && !!onStop;
const relaxedHeroInput = isHero && images.length === 0 && !isStreaming; const relaxedHeroInput = isHero && images.length === 0 && !isStreaming;
const inputTextClasses = cn( const inputTextClasses = cn(
@ -1531,7 +1709,15 @@ export function ThreadComposer({
> >
<Plus className={cn(isHero ? "h-[18px] w-[18px]" : "h-4 w-4")} /> <Plus className={cn(isHero ? "h-[18px] w-[18px]" : "h-4 w-4")} />
</Button> </Button>
{workspaceScope ? ( {voiceRecorder.isRecording ? (
<VoiceRecordingMeter
ariaLabel={voiceRecordingStatusLabel}
className="mx-1 flex-1"
elapsedLabel={voiceRecorder.elapsedLabel}
isHero={isHero}
levels={voiceRecorder.levels}
/>
) : workspaceScope ? (
<WorkspaceAccessMenu <WorkspaceAccessMenu
scope={workspaceScope} scope={workspaceScope}
disabled={disabled || workspaceScopeDisabled} disabled={disabled || workspaceScopeDisabled}
@ -1542,7 +1728,7 @@ export function ThreadComposer({
) : null} ) : null}
</div> </div>
<div className={cn("flex shrink-0 items-center", isHero ? "gap-1.5" : "gap-2")}> <div className={cn("flex shrink-0 items-center", isHero ? "gap-1.5" : "gap-2")}>
{modelLabel ? ( {modelLabel && !voiceRecorder.isRecording ? (
<ComposerModelBadge <ComposerModelBadge
label={modelLabel} label={modelLabel}
provider={modelProvider} provider={modelProvider}
@ -1552,6 +1738,53 @@ export function ThreadComposer({
onClick={modelNeedsSetup ? onModelBadgeClick : undefined} onClick={modelNeedsSetup ? onModelBadgeClick : undefined}
/> />
) : null} ) : null}
{showVoiceButton ? (
<TooltipProvider delayDuration={220} skipDelayDuration={80}>
<Tooltip>
<TooltipTrigger asChild>
<Button
type="button"
size="icon"
variant="ghost"
disabled={voiceRecorder.buttonDisabled}
aria-label={voiceButtonLabel}
aria-keyshortcuts={VOICE_SHORTCUT_ARIA}
title={voiceButtonTooltip}
onPointerDown={voiceRecorder.beginPress}
onPointerUp={voiceRecorder.endPress}
onPointerCancel={voiceRecorder.endPress}
onClick={voiceRecorder.handleClick}
className={cn(
"rounded-full border border-transparent text-muted-foreground hover:bg-muted/65 hover:text-foreground",
isHero ? "h-8 w-8" : "h-9 w-9",
voiceRecorder.isRecording &&
"bg-red-500 text-white shadow-[0_8px_20px_rgba(239,68,68,0.22)] hover:bg-red-500 hover:text-white",
)}
>
{voiceRecorder.state === "transcribing" ? (
<Loader2 className={cn(isHero ? "h-4 w-4" : "h-4 w-4", "animate-spin")} />
) : voiceRecorder.isRecording ? (
<Square className={cn(isHero ? "h-3.5 w-3.5" : "h-3.5 w-3.5")} fill="currentColor" />
) : (
<Mic className={cn(isHero ? "h-4 w-4" : "h-4 w-4")} />
)}
</Button>
</TooltipTrigger>
<TooltipContent
side="top"
align="center"
className="flex items-center gap-2 rounded-full border border-border/70 bg-background px-3 py-1.5 text-[13px] font-medium text-foreground shadow-[0_8px_24px_rgba(15,23,42,0.13)] dark:border-white/10 dark:bg-neutral-900 dark:text-white"
>
<span>{voiceButtonTooltip}</span>
{voiceRecorder.state === "idle" ? (
<kbd className="rounded-full bg-muted px-2 py-0.5 font-sans text-[12px] font-semibold leading-none text-muted-foreground dark:bg-white/10 dark:text-white/80">
{voiceShortcutLabel}
</kbd>
) : null}
</TooltipContent>
</Tooltip>
</TooltipProvider>
) : null}
<Button <Button
type={showStopButton || modelNeedsSetup ? "button" : "submit"} type={showStopButton || modelNeedsSetup ? "button" : "submit"}
size="icon" size="icon"

View File

@ -302,6 +302,7 @@ export function ThreadShell({
runStartedAt, runStartedAt,
goalState, goalState,
send, send,
transcribeAudio,
stop, stop,
setMessages, setMessages,
streamError, streamError,
@ -642,6 +643,7 @@ export function ThreadShell({
cliApps={cliApps} cliApps={cliApps}
mcpPresets={mcpPresets} mcpPresets={mcpPresets}
onStop={stop} onStop={stop}
onTranscribeAudio={transcribeAudio}
runStartedAt={runStartedAt} runStartedAt={runStartedAt}
goalState={goalState} goalState={goalState}
workspaceScope={workspaceScope} workspaceScope={workspaceScope}
@ -672,6 +674,7 @@ export function ThreadShell({
cliApps={cliApps} cliApps={cliApps}
mcpPresets={mcpPresets} mcpPresets={mcpPresets}
runStartedAt={runStartedAt} runStartedAt={runStartedAt}
onTranscribeAudio={transcribeAudio}
goalState={goalState} goalState={goalState}
workspaceScope={workspaceScope} workspaceScope={workspaceScope}
workspaceDefaultScope={workspaceDefaultScope} workspaceDefaultScope={workspaceDefaultScope}

View File

@ -438,6 +438,7 @@ export function useNanobotStream(
/** Latest sustained goal for this ``chatId`` (``goal_state`` WS events). */ /** Latest sustained goal for this ``chatId`` (``goal_state`` WS events). */
goalState: GoalStateWsPayload | undefined; goalState: GoalStateWsPayload | undefined;
send: (content: string, images?: SendImage[], options?: SendOptions) => void; send: (content: string, images?: SendImage[], options?: SendOptions) => void;
transcribeAudio: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
stop: () => void; stop: () => void;
setMessages: React.Dispatch<React.SetStateAction<UIMessage[]>>; setMessages: React.Dispatch<React.SetStateAction<UIMessage[]>>;
/** Latest transport-level fault raised since the last ``dismissStreamError``. /** Latest transport-level fault raised since the last ``dismissStreamError``.
@ -1089,12 +1090,19 @@ export function useNanobotStream(
client.sendMessage(chatId, "/stop"); client.sendMessage(chatId, "/stop");
}, [chatId, clearActivitySegment, client, flushPendingStreamEvents]); }, [chatId, clearActivitySegment, client, flushPendingStreamEvents]);
const transcribeAudio = useCallback(
(dataUrl: string, options?: { durationMs?: number }) =>
client.transcribeAudio(dataUrl, options),
[client],
);
return { return {
messages, messages,
isStreaming, isStreaming,
runStartedAt, runStartedAt,
goalState, goalState,
send, send,
transcribeAudio,
stop, stop,
setMessages, setMessages,
streamError, streamError,

View File

@ -0,0 +1,422 @@
import {
useCallback,
useEffect,
useRef,
useState,
type PointerEvent as ReactPointerEvent,
} from "react";
// Recording is stopped automatically this many milliseconds after it starts.
const VOICE_RECORDING_MAX_MS = 120_000;
// Recordings shorter than this are discarded and reported as "tooShort".
const VOICE_RECORDING_MIN_MS = 650;
// Delay after recording starts before the "noInput" hint is raised, if no
// level at or above VOICE_MIN_LEVEL has been measured by then.
const VOICE_NO_INPUT_HINT_MS = 1_100;
// How long the pointer must stay down before a press turns into hold-to-record.
const VOICE_HOLD_START_MS = 140;
// Number of entries kept in the rolling waveform `levels` array.
const VOICE_WAVEFORM_BAR_COUNT = 64;
// Bar height used for idle bars and for levels below VOICE_MIN_LEVEL.
const VOICE_WAVEFORM_SILENT_HEIGHT = 3;
// Bar height range that levels at or above VOICE_MIN_LEVEL are mapped onto.
const VOICE_WAVEFORM_MIN_HEIGHT = 7;
const VOICE_WAVEFORM_MAX_HEIGHT = 34;
// Threshold on the 0..1 value from voiceLevelFromSamples(); anything lower
// counts as silence.
const VOICE_MIN_LEVEL = 0.018;
// Initial waveform: every bar at the silent height.
const VOICE_WAVEFORM_IDLE_LEVELS = Array.from(
{ length: VOICE_WAVEFORM_BAR_COUNT },
() => VOICE_WAVEFORM_SILENT_HEIGHT,
);
// Recording MIME types in preference order; the first one MediaRecorder
// reports as supported is used.
const VOICE_MIME_CANDIDATES = [
"audio/webm;codecs=opus",
"audio/webm",
"audio/mp4",
"audio/ogg;codecs=opus",
] as const;
/** Phase of the recorder: waiting, capturing audio, or awaiting the transcript. */
export type VoiceRecorderState = "idle" | "recording" | "transcribing";
/** Error categories the recorder reports through `onError`. */
export type VoiceRecorderErrorKey =
| "failed"
| "noInput"
| "notConfigured"
| "permission"
| "tooLong"
| "tooShort"
| "unsupported";
/** Options accepted by `useVoiceRecorder`. */
interface VoiceRecorderOptions {
// When true, press/shortcut holds do not start recording and the returned
// `buttonDisabled` flag is set.
disabled?: boolean;
// Called when recording starts, and again when input is detected after the
// "noInput" hint was raised.
onClearError: () => void;
// Called with the category of any recording or transcription failure.
onError: (key: VoiceRecorderErrorKey) => void;
// Receives the value resolved by `onTranscribeAudio`.
onTranscript: (text: string) => void;
// Transcribes the recorded audio, passed as a data URL together with the
// measured duration. When omitted, recording never starts.
onTranscribeAudio?: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
}
/**
 * React hook that records microphone audio and hands it to a transcription
 * callback.
 *
 * State machine: `idle` → `recording` (a MediaRecorder is running) →
 * `transcribing` (waiting for `onTranscribeAudio`) → `idle`. Recording can be
 * driven three ways: a plain click toggles it, a pointer press held for
 * `VOICE_HOLD_START_MS` records until release, and `beginShortcutHold` /
 * `endShortcutHold` record for as long as a keyboard shortcut is held.
 *
 * While recording, an AnalyserNode samples the stream once per animation frame
 * to feed the rolling `levels` waveform and to detect silence. A recording is
 * dropped without transcription when it produced no data, is shorter than
 * `VOICE_RECORDING_MIN_MS` ("tooShort"), or never rose above `VOICE_MIN_LEVEL`
 * ("noInput").
 *
 * @param options.disabled Blocks press/shortcut starts and disables the button.
 * @param options.onClearError Called when a previous inline error should be cleared.
 * @param options.onError Called with the category of a failure.
 * @param options.onTranscript Receives the transcript resolved by `onTranscribeAudio`.
 * @param options.onTranscribeAudio Transcribes a data URL; when absent, recording never starts.
 * @returns Handlers for the voice button and keyboard shortcut, plus
 *   `state`, `isRecording`, `buttonDisabled`, `elapsedLabel` ("m:ss") and `levels`.
 */
export function useVoiceRecorder({
  disabled,
  onClearError,
  onError,
  onTranscript,
  onTranscribeAudio,
}: VoiceRecorderOptions) {
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<BlobPart[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  const audioRef = useRef<VoiceAudioState | null>(null);
  const startedAtRef = useRef(0);
  const maxTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const inputHintTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const holdTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  // True while a pointer hold (not a click) owns the current recording.
  const holdActiveRef = useRef(false);
  // True from the moment a start is requested until cleanupRecording() runs.
  const startPendingRef = useRef(false);
  // Set when a hold/shortcut is released before the recorder exists yet.
  const stopAfterStartRef = useRef(false);
  // Swallows the click event that follows a pointer hold.
  const suppressClickRef = useRef(false);
  const suppressClickTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const shortcutActiveRef = useRef(false);
  const levelObservedRef = useRef(false);
  const peakLevelRef = useRef(0);
  const levelReliableRef = useRef(false);
  const noInputHintVisibleRef = useRef(false);
  const [state, setState] = useState<VoiceRecorderState>("idle");
  const [elapsedMs, setElapsedMs] = useState(0);
  const [levels, setLevels] = useState<number[]>(VOICE_WAVEFORM_IDLE_LEVELS);

  const clearInputHintTimer = useCallback(() => clearTimer(inputHintTimerRef), []);
  const clearSuppressClickTimer = useCallback(() => clearTimer(suppressClickTimerRef), []);

  // Ignore the next click for up to 500 ms; used around pointer holds so the
  // trailing click does not toggle recording again.
  const suppressNextClick = useCallback(() => {
    clearSuppressClickTimer();
    suppressClickRef.current = true;
    suppressClickTimerRef.current = setTimeout(() => {
      suppressClickRef.current = false;
      suppressClickTimerRef.current = null;
    }, 500);
  }, [clearSuppressClickTimer]);

  // Tear down the analyser graph and its animation-frame loop.
  const stopWaveform = useCallback(() => {
    const audio = audioRef.current;
    audioRef.current = null;
    if (!audio) return;
    if (audio.frame !== null) cancelAnimationFrame(audio.frame);
    audio.source.disconnect();
    audio.analyser.disconnect();
    void audio.context.close().catch(() => undefined);
  }, []);

  // Start sampling `stream` once per animation frame. Failures here are
  // swallowed on purpose: recording continues without a waveform, and silence
  // detection stays off because levelReliableRef is never set.
  const startWaveform = useCallback((stream: MediaStream) => {
    const AudioContextCtor = audioContextConstructor();
    if (!AudioContextCtor) return;
    stopWaveform();
    setLevels(VOICE_WAVEFORM_IDLE_LEVELS);
    try {
      const context = new AudioContextCtor();
      const source = context.createMediaStreamSource(stream);
      const analyser = context.createAnalyser();
      analyser.fftSize = 256;
      analyser.smoothingTimeConstant = 0.68;
      source.connect(analyser);
      const audio: VoiceAudioState = {
        analyser,
        context,
        data: new Uint8Array(analyser.fftSize),
        frame: null,
        source,
      };
      const tick = () => {
        const current = audioRef.current;
        if (!current) return;
        if (current.context.state !== "running") {
          // Keep asking the context to resume until it is running; no samples
          // are read (and no level is recorded) before that.
          void current.context.resume().catch(() => undefined);
          current.frame = requestAnimationFrame(tick);
          return;
        }
        current.analyser.getByteTimeDomainData(current.data);
        const level = voiceLevelFromSamples(current.data);
        levelReliableRef.current = true;
        levelObservedRef.current = true;
        peakLevelRef.current = Math.max(peakLevelRef.current, level);
        if (level >= VOICE_MIN_LEVEL) {
          clearInputHintTimer();
          if (noInputHintVisibleRef.current) {
            noInputHintVisibleRef.current = false;
            onClearError();
          }
        }
        // Rolling window: drop the oldest bar, append the newest.
        setLevels((currentLevels) => [
          ...currentLevels.slice(1),
          waveformHeightFromLevel(level),
        ]);
        current.frame = requestAnimationFrame(tick);
      };
      audioRef.current = audio;
      void context.resume().catch(() => undefined);
      audio.frame = requestAnimationFrame(tick);
    } catch {
      stopWaveform();
    }
  }, [clearInputHintTimer, onClearError, stopWaveform]);

  // Release every resource tied to the current recording attempt: timers,
  // analyser graph, microphone tracks and recorder reference.
  const cleanupRecording = useCallback(() => {
    clearTimer(holdTimerRef);
    clearInputHintTimer();
    clearTimer(maxTimerRef);
    stopWaveform();
    streamRef.current?.getTracks().forEach((track) => track.stop());
    streamRef.current = null;
    mediaRecorderRef.current = null;
    startPendingRef.current = false;
    shortcutActiveRef.current = false;
    noInputHintVisibleRef.current = false;
  }, [clearInputHintTimer, stopWaveform]);

  const stopRecording = useCallback(() => {
    const recorder = mediaRecorderRef.current;
    if (!recorder || recorder.state === "inactive") return;
    recorder.stop();
  }, []);

  // Stop now if a recorder is running; otherwise, if a start is still pending,
  // remember to stop as soon as it finishes.
  const stopRecordingWhenReady = useCallback(() => {
    const recorder = mediaRecorderRef.current;
    if (recorder && recorder.state !== "inactive") {
      stopRecording();
    } else if (startPendingRef.current) {
      stopAfterStartRef.current = true;
    }
  }, [stopRecording]);

  const startRecording = useCallback(async () => {
    if (!onTranscribeAudio || state !== "idle" || startPendingRef.current) return;
    if (!navigator.mediaDevices?.getUserMedia || typeof MediaRecorder === "undefined") {
      onError("unsupported");
      return;
    }
    startPendingRef.current = true;
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      if (!startPendingRef.current) {
        // cleanupRecording() ran while the request was pending (for example the
        // component unmounted during the permission prompt). Nothing owns this
        // stream, so release the microphone here instead of recording.
        stream.getTracks().forEach((track) => track.stop());
        return;
      }
      // Register the stream before anything below can throw, so that
      // cleanupRecording() in the catch block always stops its tracks.
      streamRef.current = stream;
      const recorder = new MediaRecorder(stream, mediaRecorderOptions());
      chunksRef.current = [];
      mediaRecorderRef.current = recorder;
      startedAtRef.current = Date.now();
      levelObservedRef.current = false;
      peakLevelRef.current = 0;
      levelReliableRef.current = false;
      noInputHintVisibleRef.current = false;
      setElapsedMs(0);
      startWaveform(stream);

      recorder.ondataavailable = (event) => {
        if (event.data.size > 0) chunksRef.current.push(event.data);
      };
      recorder.onstop = () => {
        // Snapshot everything needed before cleanupRecording() resets the refs.
        const chunks = chunksRef.current.splice(0);
        const durationMs = Math.max(0, Date.now() - startedAtRef.current);
        const mimeType = recorder.mimeType || "audio/webm";
        const hasMeasuredSilence =
          levelReliableRef.current
          && levelObservedRef.current
          && peakLevelRef.current < VOICE_MIN_LEVEL;
        cleanupRecording();
        if (chunks.length === 0) {
          setState("idle");
          return;
        }
        if (durationMs < VOICE_RECORDING_MIN_MS) {
          setState("idle");
          onError("tooShort");
          return;
        }
        if (hasMeasuredSilence) {
          setState("idle");
          onError("noInput");
          return;
        }
        setState("transcribing");
        void blobToDataUrl(new Blob(chunks, { type: mimeType }))
          .then((dataUrl) => onTranscribeAudio(dataUrl, { durationMs }))
          .then(onTranscript)
          .catch((error) => onError(transcriptionErrorKey(error)))
          .finally(() => setState("idle"));
      };

      recorder.start();
      setState("recording");
      onClearError();
      maxTimerRef.current = setTimeout(stopRecording, VOICE_RECORDING_MAX_MS);
      inputHintTimerRef.current = setTimeout(() => {
        const recording = mediaRecorderRef.current?.state === "recording";
        if (
          !recording
          || !levelReliableRef.current
          || !levelObservedRef.current
          || peakLevelRef.current >= VOICE_MIN_LEVEL
        ) {
          return;
        }
        noInputHintVisibleRef.current = true;
        onError("noInput");
      }, VOICE_NO_INPUT_HINT_MS);
    } catch {
      cleanupRecording();
      setState("idle");
      onError("permission");
    }
  }, [
    cleanupRecording,
    onClearError,
    onError,
    onTranscribeAudio,
    onTranscript,
    startWaveform,
    state,
    stopRecording,
  ]);

  // Start recording for a hold gesture; if the hold was released while the
  // start was still pending, stop as soon as the recorder is up.
  const startRecordingWithDeferredStop = useCallback(() => {
    stopAfterStartRef.current = false;
    void startRecording().then(() => {
      if (!stopAfterStartRef.current) return;
      stopAfterStartRef.current = false;
      stopRecording();
    });
  }, [startRecording, stopRecording]);

  const beginPress = useCallback((event: ReactPointerEvent<HTMLButtonElement>) => {
    if (event.pointerType === "mouse" && event.button !== 0) return;
    if (!onTranscribeAudio || disabled || state !== "idle") return;
    clearTimer(holdTimerRef);
    try {
      event.currentTarget.setPointerCapture(event.pointerId);
    } catch {
      // Some embedded runtimes do not expose pointer capture for toolbar buttons.
    }
    holdTimerRef.current = setTimeout(() => {
      holdTimerRef.current = null;
      holdActiveRef.current = true;
      suppressNextClick();
      startRecordingWithDeferredStop();
    }, VOICE_HOLD_START_MS);
  }, [disabled, onTranscribeAudio, startRecordingWithDeferredStop, state, suppressNextClick]);

  const endPress = useCallback(() => {
    const wasHoldRecording = holdActiveRef.current;
    clearTimer(holdTimerRef);
    // A release before the hold threshold is a plain click; handleClick deals with it.
    if (!wasHoldRecording) return;
    holdActiveRef.current = false;
    suppressNextClick();
    stopRecordingWhenReady();
  }, [stopRecordingWhenReady, suppressNextClick]);

  const handleClick = useCallback(() => {
    if (suppressClickRef.current) {
      clearSuppressClickTimer();
      suppressClickRef.current = false;
      return;
    }
    if (state === "recording") stopRecording();
    else void startRecording();
  }, [clearSuppressClickTimer, startRecording, state, stopRecording]);

  const beginShortcutHold = useCallback(() => {
    if (!onTranscribeAudio || disabled || state !== "idle" || shortcutActiveRef.current) return;
    shortcutActiveRef.current = true;
    startRecordingWithDeferredStop();
  }, [disabled, onTranscribeAudio, startRecordingWithDeferredStop, state]);

  const endShortcutHold = useCallback(() => {
    if (!shortcutActiveRef.current) return;
    shortcutActiveRef.current = false;
    stopRecordingWhenReady();
  }, [stopRecordingWhenReady]);

  // Refresh the elapsed time four times per second while recording.
  useEffect(() => {
    if (state !== "recording") {
      setElapsedMs(0);
      return;
    }
    const updateElapsed = () => {
      setElapsedMs(Math.max(0, Date.now() - startedAtRef.current));
    };
    updateElapsed();
    const interval = window.setInterval(updateElapsed, 250);
    return () => window.clearInterval(interval);
  }, [state]);

  // Release the microphone and timers when the component unmounts.
  useEffect(() => cleanupRecording, [cleanupRecording]);
  useEffect(() => () => clearSuppressClickTimer(), [clearSuppressClickTimer]);

  return {
    beginShortcutHold,
    beginPress,
    buttonDisabled: disabled || state === "transcribing",
    elapsedLabel: formatVoiceElapsed(elapsedMs),
    endShortcutHold,
    endPress,
    handleClick,
    isRecording: state === "recording",
    levels,
    state,
  };
}
/** Web Audio objects backing the live waveform for one recording. */
interface VoiceAudioState {
// Analyser node the microphone source is connected to.
analyser: AnalyserNode;
// Audio context that owns the nodes; closed when the waveform stops.
context: AudioContext;
// Reusable buffer the analyser's time-domain samples are read into.
data: Uint8Array<ArrayBuffer>;
// Id of the scheduled animation frame, or null when none has been scheduled.
frame: number | null;
// Source node created from the microphone MediaStream.
source: MediaStreamAudioSourceNode;
}
/** Cancels the timeout held in `ref` (if any) and resets the ref to null. */
function clearTimer(ref: { current: ReturnType<typeof setTimeout> | null }) {
  const pending = ref.current;
  if (pending === null) return;
  clearTimeout(pending);
  ref.current = null;
}
/**
 * Picks the first MIME type from `VOICE_MIME_CANDIDATES` that MediaRecorder
 * reports as supported. Returns undefined when MediaRecorder is not defined or
 * no candidate is supported.
 */
function mediaRecorderOptions(): MediaRecorderOptions | undefined {
  if (typeof MediaRecorder === "undefined") return undefined;
  for (const mimeType of VOICE_MIME_CANDIDATES) {
    if (MediaRecorder.isTypeSupported(mimeType)) return { mimeType };
  }
  return undefined;
}
/**
 * Formats a millisecond duration as `m:ss`, rounding down to whole seconds.
 * Negative durations are shown as `0:00`.
 */
function formatVoiceElapsed(ms: number): string {
  const totalSeconds = Math.max(0, Math.floor(ms / 1000));
  const secondsPart = String(totalSeconds % 60).padStart(2, "0");
  const minutesPart = Math.floor(totalSeconds / 60);
  return `${minutesPart}:${secondsPart}`;
}
/**
 * Looks up the `AudioContext` constructor on `window`.
 *
 * @returns `window.AudioContext` when present, otherwise
 *   `window.webkitAudioContext`; `undefined` when `window` is not defined or
 *   neither property exists.
 */
function audioContextConstructor(): typeof AudioContext | undefined {
  if (typeof window !== "undefined") {
    const standard = window.AudioContext;
    if (standard != null) return standard;
    const prefixedHost = window as unknown as { webkitAudioContext?: typeof AudioContext };
    return prefixedHost.webkitAudioContext;
  }
  return undefined;
}
/**
 * Converts a buffer of samples into a level capped at 1.
 *
 * Each sample is re-centred around 128 and scaled by 1/128 before the RMS is
 * taken; the RMS is then multiplied by 4.2, raised to the power 0.72 and
 * capped at 1.
 *
 * @param samples - Sample values; 128 is treated as the zero point.
 * @returns `0` for an empty buffer, otherwise the shaped level.
 */
function voiceLevelFromSamples(samples: ArrayLike<number>): number {
  const count = samples.length;
  if (count === 0) return 0;
  let energy = 0;
  let cursor = 0;
  while (cursor < count) {
    const normalized = (samples[cursor] - 128) / 128;
    energy += normalized * normalized;
    cursor += 1;
  }
  const rootMeanSquare = Math.sqrt(energy / count);
  const shaped = Math.pow(rootMeanSquare * 4.2, 0.72);
  return Math.min(1, shaped);
}
/**
 * Maps a level to a rounded waveform height.
 *
 * Levels below `VOICE_MIN_LEVEL` return `VOICE_WAVEFORM_SILENT_HEIGHT`.
 * Other levels are rescaled from `[VOICE_MIN_LEVEL, 1]` to `[0, 1]` (capped
 * at 1) and interpolated between `VOICE_WAVEFORM_MIN_HEIGHT` and
 * `VOICE_WAVEFORM_MAX_HEIGHT`.
 *
 * @param level - Input level to map.
 * @returns The silent height, or the interpolated height rounded to an integer.
 */
function waveformHeightFromLevel(level: number): number {
  if (level < VOICE_MIN_LEVEL) return VOICE_WAVEFORM_SILENT_HEIGHT;
  const usableRange = 1 - VOICE_MIN_LEVEL;
  const heightSpan = VOICE_WAVEFORM_MAX_HEIGHT - VOICE_WAVEFORM_MIN_HEIGHT;
  const activeLevel = Math.min(1, (level - VOICE_MIN_LEVEL) / usableRange);
  return Math.round(VOICE_WAVEFORM_MIN_HEIGHT + activeLevel * heightSpan);
}
/**
 * Reads a blob through `FileReader.readAsDataURL` and resolves with the result.
 *
 * @param blob - Binary payload to encode.
 * @returns A promise for the reader's string result. It rejects with
 *   `Error("invalid_data_url")` when the result is not a string, and with the
 *   reader's own error (or `Error("read_failed")` when none is set) on a read
 *   error.
 */
function blobToDataUrl(blob: Blob): Promise<string> {
  return new Promise((resolve, reject) => {
    const fileReader = new FileReader();
    const handleLoad = () => {
      const { result } = fileReader;
      if (typeof result !== "string") {
        reject(new Error("invalid_data_url"));
        return;
      }
      resolve(result);
    };
    const handleError = () => {
      reject(fileReader.error ?? new Error("read_failed"));
    };
    fileReader.onload = handleLoad;
    fileReader.onerror = handleError;
    fileReader.readAsDataURL(blob);
  });
}
/**
 * Maps a thrown value to a voice recorder error key.
 *
 * Only `Error` instances are inspected; any other value is treated as having
 * an empty message.
 *
 * @param error - The caught value.
 * @returns `"notConfigured"` for message `not_configured`, `"tooLong"` for
 *   message `duration`, and `"failed"` for everything else.
 */
function transcriptionErrorKey(error: unknown): VoiceRecorderErrorKey {
  const message = error instanceof Error ? error.message : "";
  switch (message) {
    case "not_configured":
      return "notConfigured";
    case "duration":
      return "tooLong";
    default:
      return "failed";
  }
}

View File

@ -73,6 +73,7 @@
"models": "Models", "models": "Models",
"providers": "Providers", "providers": "Providers",
"image": "Image", "image": "Image",
"voice": "Voice",
"browser": "Web", "browser": "Web",
"cliApps": "CLI Apps", "cliApps": "CLI Apps",
"mcp": "MCP", "mcp": "MCP",
@ -99,7 +100,8 @@
"capabilities": "Capabilities", "capabilities": "Capabilities",
"apps": "Apps", "apps": "Apps",
"nativeHost": "Native host", "nativeHost": "Native host",
"hostSafety": "App safety" "hostSafety": "App safety",
"voiceInput": "Voice input"
}, },
"models": { "models": {
"selectModel": "Select model", "selectModel": "Select model",
@ -161,7 +163,13 @@
"engine": "Engine", "engine": "Engine",
"logs": "Logs", "logs": "Logs",
"diagnostics": "Diagnostics", "diagnostics": "Diagnostics",
"contextWindow": "Context window" "contextWindow": "Context window",
"transcription": "Transcription",
"transcriptionProvider": "Provider",
"transcriptionProviderStatus": "Provider status",
"transcriptionModel": "Model",
"transcriptionLanguage": "Language",
"voiceLimits": "Limits"
}, },
"help": { "help": {
"theme": "Switch between light and dark appearance.", "theme": "Switch between light and dark appearance.",
@ -200,7 +208,12 @@
"diagnostics": "Export a small runtime report for support.", "diagnostics": "Export a small runtime report for support.",
"localServiceAccessNative": "Allow Full Access shell commands to reach services on this Mac.", "localServiceAccessNative": "Allow Full Access shell commands to reach services on this Mac.",
"webuiDefaultAccessNative": "Used by native chats without a project-specific permission.", "webuiDefaultAccessNative": "Used by native chats without a project-specific permission.",
"contextWindow": "Choose the default context budget for this model configuration." "contextWindow": "Choose the default context budget for this model configuration.",
"transcription": "Transcribe microphone input before sending it. Chat channel voice messages use the same settings.",
"transcriptionProvider": "Uses the matching provider credentials from Providers.",
"transcriptionProviderStatus": "API keys stay under providers, not in transcription settings.",
"transcriptionModel": "Leave as the resolved default unless your provider needs a custom model id.",
"transcriptionLanguage": "Optional ISO-639 hint such as en, zh, ja, or ko."
}, },
"timezone": { "timezone": {
"select": "Select timezone", "select": "Select timezone",
@ -391,6 +404,7 @@
"totalProviders": "{{count}} available", "totalProviders": "{{count}} available",
"webSearch": "Web search", "webSearch": "Web search",
"imageGeneration": "Image generation", "imageGeneration": "Image generation",
"voiceInput": "Voice input",
"workspace": "Workspace" "workspace": "Workspace"
}, },
"usage": { "usage": {
@ -486,6 +500,11 @@
"rawInstructions": "Raw SKILL.md", "rawInstructions": "Raw SKILL.md",
"rawInstructionsEmpty": "No raw instructions.", "rawInstructionsEmpty": "No raw instructions.",
"detailDescription": "Details for {{name}}." "detailDescription": "Details for {{name}}."
},
"voice": {
"selectProvider": "Select provider",
"configureProvider": "Configure provider",
"languageAuto": "Auto"
} }
}, },
"chat": { "chat": {
@ -678,6 +697,21 @@
"deepResearch": "Deep research", "deepResearch": "Deep research",
"voice": "Voice input" "voice": "Voice input"
}, },
"voice": {
"hint": "Click to dictate or hold",
"stop": "Stop recording",
"transcribing": "Transcribing...",
"recordingStatus": "Recording {{time}}"
},
"voiceErrors": {
"unsupported": "Voice input is not supported in this browser.",
"permission": "Microphone permission is required.",
"notConfigured": "Configure a transcription provider first.",
"tooLong": "Recording is too long.",
"tooShort": "Hold a little longer to record voice.",
"noInput": "No microphone input detected.",
"failed": "Could not transcribe audio."
},
"slash": { "slash": {
"ariaLabel": "Slash commands", "ariaLabel": "Slash commands",
"label": "commands", "label": "commands",

View File

@ -73,6 +73,7 @@
"models": "Modelos", "models": "Modelos",
"providers": "Proveedores", "providers": "Proveedores",
"image": "Imagen", "image": "Imagen",
"voice": "Voz",
"browser": "Internet", "browser": "Internet",
"runtime": "Sistema", "runtime": "Sistema",
"advanced": "Seguridad", "advanced": "Seguridad",
@ -99,7 +100,8 @@
"mcp": "Servicios MCP", "mcp": "Servicios MCP",
"apps": "Aplicaciones", "apps": "Aplicaciones",
"nativeHost": "Host nativo", "nativeHost": "Host nativo",
"hostSafety": "Seguridad de la app" "hostSafety": "Seguridad de la app",
"voiceInput": "Entrada de voz"
}, },
"rows": { "rows": {
"theme": "Tema", "theme": "Tema",
@ -142,7 +144,13 @@
"engine": "Motor", "engine": "Motor",
"logs": "Registros", "logs": "Registros",
"diagnostics": "Diagnóstico", "diagnostics": "Diagnóstico",
"contextWindow": "Ventana de contexto" "contextWindow": "Ventana de contexto",
"transcription": "Transcripción",
"transcriptionProvider": "Proveedor",
"transcriptionProviderStatus": "Estado del proveedor",
"transcriptionModel": "Modelo",
"transcriptionLanguage": "Idioma",
"voiceLimits": "Límites"
}, },
"help": { "help": {
"theme": "Cambia entre apariencia clara y oscura.", "theme": "Cambia entre apariencia clara y oscura.",
@ -181,7 +189,12 @@
"diagnostics": "Exporta un pequeño informe de runtime para soporte.", "diagnostics": "Exporta un pequeño informe de runtime para soporte.",
"localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.", "localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.",
"webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.", "webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.",
"contextWindow": "Elige el presupuesto de contexto predeterminado para esta configuración de modelo." "contextWindow": "Elige el presupuesto de contexto predeterminado para esta configuración de modelo.",
"transcription": "Transcribe la entrada del micrófono antes de enviarla. Los mensajes de voz de los canales usan la misma configuración.",
"transcriptionProvider": "Usa las credenciales del proveedor correspondiente en Proveedores.",
"transcriptionProviderStatus": "Las claves API permanecen en proveedores, no en la configuración de transcripción.",
"transcriptionModel": "Déjalo como el valor predeterminado resuelto salvo que el proveedor necesite un id de modelo personalizado.",
"transcriptionLanguage": "Pista ISO-639 opcional, como en, zh, ja o ko."
}, },
"values": { "values": {
"light": "Claro", "light": "Claro",
@ -283,6 +296,7 @@
"totalProviders": "{{count}} disponibles", "totalProviders": "{{count}} disponibles",
"webSearch": "Búsqueda web", "webSearch": "Búsqueda web",
"imageGeneration": "Generación de imágenes", "imageGeneration": "Generación de imágenes",
"voiceInput": "Entrada de voz",
"workspace": "Espacio de trabajo" "workspace": "Espacio de trabajo"
}, },
"usage": { "usage": {
@ -486,6 +500,11 @@
"rawInstructions": "SKILL.md original", "rawInstructions": "SKILL.md original",
"rawInstructionsEmpty": "No hay instrucciones originales.", "rawInstructionsEmpty": "No hay instrucciones originales.",
"detailDescription": "Detalles de {{name}}." "detailDescription": "Detalles de {{name}}."
},
"voice": {
"selectProvider": "Seleccionar proveedor",
"configureProvider": "Configurar proveedor",
"languageAuto": "Auto"
} }
}, },
"chat": { "chat": {
@ -678,6 +697,21 @@
"deepResearch": "Investigación profunda", "deepResearch": "Investigación profunda",
"voice": "Entrada de voz" "voice": "Entrada de voz"
}, },
"voice": {
"hint": "Haz clic para dictar o mantén",
"stop": "Detener grabación",
"transcribing": "Transcribiendo...",
"recordingStatus": "Grabando {{time}}"
},
"voiceErrors": {
"unsupported": "Este navegador no admite entrada de voz.",
"permission": "Se requiere permiso de micrófono.",
"notConfigured": "Configura primero un proveedor de transcripción.",
"tooLong": "La grabación es demasiado larga.",
"tooShort": "Mantén pulsado un poco más para grabar voz.",
"noInput": "No se detectó entrada del micrófono.",
"failed": "No se pudo transcribir el audio."
},
"slash": { "slash": {
"ariaLabel": "Comandos slash", "ariaLabel": "Comandos slash",
"label": "comandos", "label": "comandos",

View File

@ -73,6 +73,7 @@
"models": "Modèles", "models": "Modèles",
"providers": "Fournisseurs", "providers": "Fournisseurs",
"image": "Images", "image": "Images",
"voice": "Voix",
"browser": "Internet", "browser": "Internet",
"runtime": "Système", "runtime": "Système",
"advanced": "Sécurité", "advanced": "Sécurité",
@ -99,7 +100,8 @@
"mcp": "Services MCP", "mcp": "Services MCP",
"apps": "Applications", "apps": "Applications",
"nativeHost": "Hôte natif", "nativeHost": "Hôte natif",
"hostSafety": "Sécurité de l’app" "hostSafety": "Sécurité de l’app",
"voiceInput": "Saisie vocale"
}, },
"rows": { "rows": {
"theme": "Thème", "theme": "Thème",
@ -142,7 +144,13 @@
"engine": "Moteur", "engine": "Moteur",
"logs": "Journaux", "logs": "Journaux",
"diagnostics": "Diagnostic", "diagnostics": "Diagnostic",
"contextWindow": "Fenêtre de contexte" "contextWindow": "Fenêtre de contexte",
"transcription": "Transcription",
"transcriptionProvider": "Fournisseur",
"transcriptionProviderStatus": "État du fournisseur",
"transcriptionModel": "Modèle",
"transcriptionLanguage": "Langue",
"voiceLimits": "Limites"
}, },
"help": { "help": {
"theme": "Basculer entre l’apparence claire et sombre.", "theme": "Basculer entre l’apparence claire et sombre.",
@ -181,7 +189,12 @@
"diagnostics": "Exporte un petit rapport d’exécution pour le support.", "diagnostics": "Exporte un petit rapport d’exécution pour le support.",
"localServiceAccessNative": "Autorise les commandes shell Full Access à atteindre les services sur ce Mac.", "localServiceAccessNative": "Autorise les commandes shell Full Access à atteindre les services sur ce Mac.",
"webuiDefaultAccessNative": "Utilisé par les chats natifs sans permission propre au projet.", "webuiDefaultAccessNative": "Utilisé par les chats natifs sans permission propre au projet.",
"contextWindow": "Choisissez le budget de contexte par défaut pour cette configuration de modèle." "contextWindow": "Choisissez le budget de contexte par défaut pour cette configuration de modèle.",
"transcription": "Transcrit l'entrée micro avant l'envoi. Les messages vocaux des canaux utilisent les mêmes réglages.",
"transcriptionProvider": "Utilise les identifiants du fournisseur correspondant dans Fournisseurs.",
"transcriptionProviderStatus": "Les clés API restent dans les fournisseurs, pas dans les réglages de transcription.",
"transcriptionModel": "Laissez le modèle résolu par défaut sauf si votre fournisseur exige un id personnalisé.",
"transcriptionLanguage": "Indice ISO-639 facultatif, comme en, zh, ja ou ko."
}, },
"values": { "values": {
"light": "Clair", "light": "Clair",
@ -283,6 +296,7 @@
"totalProviders": "{{count}} disponibles", "totalProviders": "{{count}} disponibles",
"webSearch": "Recherche web", "webSearch": "Recherche web",
"imageGeneration": "Génération d’images", "imageGeneration": "Génération d’images",
"voiceInput": "Saisie vocale",
"workspace": "Espace de travail" "workspace": "Espace de travail"
}, },
"usage": { "usage": {
@ -486,6 +500,11 @@
"rawInstructions": "SKILL.md brut", "rawInstructions": "SKILL.md brut",
"rawInstructionsEmpty": "Aucune instruction brute.", "rawInstructionsEmpty": "Aucune instruction brute.",
"detailDescription": "Détails de {{name}}." "detailDescription": "Détails de {{name}}."
},
"voice": {
"selectProvider": "Choisir un fournisseur",
"configureProvider": "Configurer le fournisseur",
"languageAuto": "Auto"
} }
}, },
"chat": { "chat": {
@ -678,6 +697,21 @@
"deepResearch": "Recherche approfondie", "deepResearch": "Recherche approfondie",
"voice": "Entrée vocale" "voice": "Entrée vocale"
}, },
"voice": {
"hint": "Cliquez pour dicter ou maintenez",
"stop": "Arrêter l'enregistrement",
"transcribing": "Transcription...",
"recordingStatus": "Enregistrement {{time}}"
},
"voiceErrors": {
"unsupported": "La saisie vocale n'est pas prise en charge par ce navigateur.",
"permission": "L'autorisation du microphone est requise.",
"notConfigured": "Configurez d'abord un fournisseur de transcription.",
"tooLong": "L'enregistrement est trop long.",
"tooShort": "Maintenez un peu plus longtemps pour enregistrer la voix.",
"noInput": "Aucune entrée microphone détectée.",
"failed": "Impossible de transcrire l'audio."
},
"slash": { "slash": {
"ariaLabel": "Commandes slash", "ariaLabel": "Commandes slash",
"label": "commandes", "label": "commandes",

View File

@ -73,6 +73,7 @@
"models": "Model", "models": "Model",
"providers": "Penyedia", "providers": "Penyedia",
"image": "Gambar", "image": "Gambar",
"voice": "Suara",
"browser": "Internet", "browser": "Internet",
"runtime": "Sistem", "runtime": "Sistem",
"advanced": "Keamanan", "advanced": "Keamanan",
@ -99,7 +100,8 @@
"mcp": "Layanan MCP", "mcp": "Layanan MCP",
"apps": "Aplikasi", "apps": "Aplikasi",
"nativeHost": "Host native", "nativeHost": "Host native",
"hostSafety": "Keamanan aplikasi" "hostSafety": "Keamanan aplikasi",
"voiceInput": "Input suara"
}, },
"rows": { "rows": {
"theme": "Tema", "theme": "Tema",
@ -142,7 +144,13 @@
"engine": "Mesin", "engine": "Mesin",
"logs": "Log", "logs": "Log",
"diagnostics": "Diagnostik", "diagnostics": "Diagnostik",
"contextWindow": "Jendela konteks" "contextWindow": "Jendela konteks",
"transcription": "Transkripsi",
"transcriptionProvider": "Penyedia",
"transcriptionProviderStatus": "Status penyedia",
"transcriptionModel": "Model",
"transcriptionLanguage": "Bahasa",
"voiceLimits": "Batas"
}, },
"help": { "help": {
"theme": "Beralih antara tampilan terang dan gelap.", "theme": "Beralih antara tampilan terang dan gelap.",
@ -181,7 +189,12 @@
"diagnostics": "Exporta un pequeño informe de runtime para soporte.", "diagnostics": "Exporta un pequeño informe de runtime para soporte.",
"localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.", "localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.",
"webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.", "webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.",
"contextWindow": "Pilih anggaran konteks default untuk konfigurasi model ini." "contextWindow": "Pilih anggaran konteks default untuk konfigurasi model ini.",
"transcription": "Transkripsikan input mikrofon sebelum dikirim. Pesan suara channel memakai pengaturan yang sama.",
"transcriptionProvider": "Menggunakan kredensial penyedia yang sesuai dari Providers.",
"transcriptionProviderStatus": "API key tetap berada di providers, bukan di pengaturan transkripsi.",
"transcriptionModel": "Biarkan memakai default yang teresolusi kecuali penyedia membutuhkan id model khusus.",
"transcriptionLanguage": "Petunjuk ISO-639 opsional, seperti en, zh, ja, atau ko."
}, },
"values": { "values": {
"light": "Terang", "light": "Terang",
@ -283,6 +296,7 @@
"totalProviders": "{{count}} tersedia", "totalProviders": "{{count}} tersedia",
"webSearch": "Pencarian web", "webSearch": "Pencarian web",
"imageGeneration": "Pembuatan gambar", "imageGeneration": "Pembuatan gambar",
"voiceInput": "Input suara",
"workspace": "Ruang kerja" "workspace": "Ruang kerja"
}, },
"usage": { "usage": {
@ -486,6 +500,11 @@
"rawInstructions": "SKILL.md mentah", "rawInstructions": "SKILL.md mentah",
"rawInstructionsEmpty": "Tidak ada instruksi mentah.", "rawInstructionsEmpty": "Tidak ada instruksi mentah.",
"detailDescription": "Detail untuk {{name}}." "detailDescription": "Detail untuk {{name}}."
},
"voice": {
"selectProvider": "Pilih penyedia",
"configureProvider": "Konfigurasi penyedia",
"languageAuto": "Auto"
} }
}, },
"chat": { "chat": {
@ -678,6 +697,21 @@
"deepResearch": "Riset mendalam", "deepResearch": "Riset mendalam",
"voice": "Input suara" "voice": "Input suara"
}, },
"voice": {
"hint": "Klik untuk mendikte atau tahan",
"stop": "Hentikan rekaman",
"transcribing": "Mentranskripsi...",
"recordingStatus": "Merekam {{time}}"
},
"voiceErrors": {
"unsupported": "Input suara tidak didukung di browser ini.",
"permission": "Izin mikrofon diperlukan.",
"notConfigured": "Konfigurasikan penyedia transkripsi terlebih dahulu.",
"tooLong": "Rekaman terlalu panjang.",
"tooShort": "Tahan sedikit lebih lama untuk merekam suara.",
"noInput": "Tidak ada input mikrofon yang terdeteksi.",
"failed": "Tidak dapat mentranskripsi audio."
},
"slash": { "slash": {
"ariaLabel": "Perintah slash", "ariaLabel": "Perintah slash",
"label": "perintah", "label": "perintah",

View File

@ -73,6 +73,7 @@
"models": "モデル", "models": "モデル",
"providers": "プロバイダー", "providers": "プロバイダー",
"image": "画像", "image": "画像",
"voice": "音声",
"browser": "ウェブ", "browser": "ウェブ",
"runtime": "システム", "runtime": "システム",
"advanced": "セキュリティ", "advanced": "セキュリティ",
@ -99,7 +100,8 @@
"mcp": "MCP サービス", "mcp": "MCP サービス",
"apps": "アプリ", "apps": "アプリ",
"nativeHost": "ネイティブホスト", "nativeHost": "ネイティブホスト",
"hostSafety": "アプリの安全性" "hostSafety": "アプリの安全性",
"voiceInput": "音声入力"
}, },
"rows": { "rows": {
"theme": "テーマ", "theme": "テーマ",
@ -142,7 +144,13 @@
"engine": "エンジン", "engine": "エンジン",
"logs": "ログ", "logs": "ログ",
"diagnostics": "診断", "diagnostics": "診断",
"contextWindow": "コンテキストウィンドウ" "contextWindow": "コンテキストウィンドウ",
"transcription": "文字起こし",
"transcriptionProvider": "プロバイダー",
"transcriptionProviderStatus": "プロバイダー状態",
"transcriptionModel": "モデル",
"transcriptionLanguage": "言語",
"voiceLimits": "制限"
}, },
"help": { "help": {
"theme": "ライト表示とダーク表示を切り替えます。", "theme": "ライト表示とダーク表示を切り替えます。",
@ -181,7 +189,12 @@
"diagnostics": "サポート用の小さなランタイムレポートを書き出します。", "diagnostics": "サポート用の小さなランタイムレポートを書き出します。",
"localServiceAccessNative": "Full Access の shell コマンドがこの Mac 上のサービスにアクセスできるようにします。", "localServiceAccessNative": "Full Access の shell コマンドがこの Mac 上のサービスにアクセスできるようにします。",
"webuiDefaultAccessNative": "プロジェクト固有の権限がないネイティブチャットで使用します。", "webuiDefaultAccessNative": "プロジェクト固有の権限がないネイティブチャットで使用します。",
"contextWindow": "このモデル設定で使う既定のコンテキスト予算を選択します。" "contextWindow": "このモデル設定で使う既定のコンテキスト予算を選択します。",
"transcription": "マイク入力を送信前に文字起こしします。チャネルの音声メッセージも同じ設定を使います。",
"transcriptionProvider": "プロバイダー設定にある対応する認証情報を使います。",
"transcriptionProviderStatus": "APIキーは文字起こし設定ではなくプロバイダー側に保存されます。",
"transcriptionModel": "プロバイダーがカスタムモデルIDを必要としない限り、解決済みのデフォルトのままにします。",
"transcriptionLanguage": "en、zh、ja、ko などの任意の ISO-639 ヒント。"
}, },
"values": { "values": {
"light": "ライト", "light": "ライト",
@ -283,6 +296,7 @@
"totalProviders": "{{count}} 個利用可能", "totalProviders": "{{count}} 個利用可能",
"webSearch": "Web 検索", "webSearch": "Web 検索",
"imageGeneration": "画像生成", "imageGeneration": "画像生成",
"voiceInput": "音声入力",
"workspace": "ワークスペース" "workspace": "ワークスペース"
}, },
"usage": { "usage": {
@ -486,6 +500,11 @@
"rawInstructions": "元の SKILL.md", "rawInstructions": "元の SKILL.md",
"rawInstructionsEmpty": "元の説明はありません。", "rawInstructionsEmpty": "元の説明はありません。",
"detailDescription": "{{name}} の詳細。" "detailDescription": "{{name}} の詳細。"
},
"voice": {
"selectProvider": "プロバイダーを選択",
"configureProvider": "プロバイダーを設定",
"languageAuto": "自動"
} }
}, },
"chat": { "chat": {
@ -678,6 +697,21 @@
"deepResearch": "詳細調査", "deepResearch": "詳細調査",
"voice": "音声入力" "voice": "音声入力"
}, },
"voice": {
"hint": "クリックして音声入力、または長押し",
"stop": "録音を停止",
"transcribing": "文字起こし中...",
"recordingStatus": "録音中 {{time}}"
},
"voiceErrors": {
"unsupported": "このブラウザーは音声入力に対応していません。",
"permission": "マイクの許可が必要です。",
"notConfigured": "先に文字起こしプロバイダーを設定してください。",
"tooLong": "録音が長すぎます。",
"tooShort": "もう少し長く録音してください。",
"noInput": "マイク入力が検出されませんでした。",
"failed": "音声を文字起こしできませんでした。"
},
"slash": { "slash": {
"ariaLabel": "スラッシュコマンド", "ariaLabel": "スラッシュコマンド",
"label": "コマンド", "label": "コマンド",

View File

@ -73,6 +73,7 @@
"models": "모델", "models": "모델",
"providers": "제공자", "providers": "제공자",
"image": "이미지", "image": "이미지",
"voice": "음성",
"browser": "웹", "browser": "웹",
"runtime": "시스템", "runtime": "시스템",
"advanced": "보안", "advanced": "보안",
@ -99,7 +100,8 @@
"mcp": "MCP 서비스", "mcp": "MCP 서비스",
"apps": "앱", "apps": "앱",
"nativeHost": "네이티브 호스트", "nativeHost": "네이티브 호스트",
"hostSafety": "앱 보안" "hostSafety": "앱 보안",
"voiceInput": "음성 입력"
}, },
"rows": { "rows": {
"theme": "테마", "theme": "테마",
@ -142,7 +144,13 @@
"engine": "엔진", "engine": "엔진",
"logs": "로그", "logs": "로그",
"diagnostics": "진단", "diagnostics": "진단",
"contextWindow": "컨텍스트 창" "contextWindow": "컨텍스트 창",
"transcription": "전사",
"transcriptionProvider": "제공자",
"transcriptionProviderStatus": "제공자 상태",
"transcriptionModel": "모델",
"transcriptionLanguage": "언어",
"voiceLimits": "제한"
}, },
"help": { "help": {
"theme": "밝은 모드와 어두운 모드를 전환합니다.", "theme": "밝은 모드와 어두운 모드를 전환합니다.",
@ -181,7 +189,12 @@
"diagnostics": "지원용 작은 런타임 보고서를 내보냅니다.", "diagnostics": "지원용 작은 런타임 보고서를 내보냅니다.",
"localServiceAccessNative": "Full Access shell 명령이 이 Mac의 서비스에 접근할 수 있게 합니다.", "localServiceAccessNative": "Full Access shell 명령이 이 Mac의 서비스에 접근할 수 있게 합니다.",
"webuiDefaultAccessNative": "프로젝트별 권한이 없는 네이티브 채팅에 사용됩니다.", "webuiDefaultAccessNative": "프로젝트별 권한이 없는 네이티브 채팅에 사용됩니다.",
"contextWindow": "이 모델 구성의 기본 컨텍스트 예산을 선택합니다." "contextWindow": "이 모델 구성의 기본 컨텍스트 예산을 선택합니다.",
"transcription": "마이크 입력을 보내기 전에 텍스트로 변환합니다. 채널 음성 메시지도 같은 설정을 사용합니다.",
"transcriptionProvider": "Providers에 저장된 해당 제공자의 인증 정보를 사용합니다.",
"transcriptionProviderStatus": "API 키는 transcription 설정이 아니라 providers 아래에 유지됩니다.",
"transcriptionModel": "제공자가 사용자 지정 모델 ID를 요구하지 않으면 해석된 기본값을 사용하세요.",
"transcriptionLanguage": "en, zh, ja, ko 같은 선택적 ISO-639 힌트입니다."
}, },
"values": { "values": {
"light": "라이트", "light": "라이트",
@ -283,6 +296,7 @@
"totalProviders": "{{count}}개 사용 가능", "totalProviders": "{{count}}개 사용 가능",
"webSearch": "웹 검색", "webSearch": "웹 검색",
"imageGeneration": "이미지 생성", "imageGeneration": "이미지 생성",
"voiceInput": "음성 입력",
"workspace": "작업공간" "workspace": "작업공간"
}, },
"usage": { "usage": {
@ -486,6 +500,11 @@
"rawInstructions": "원본 SKILL.md", "rawInstructions": "원본 SKILL.md",
"rawInstructionsEmpty": "원본 지침이 없습니다.", "rawInstructionsEmpty": "원본 지침이 없습니다.",
"detailDescription": "{{name}} 세부 정보." "detailDescription": "{{name}} 세부 정보."
},
"voice": {
"selectProvider": "제공자 선택",
"configureProvider": "제공자 설정",
"languageAuto": "자동"
} }
}, },
"chat": { "chat": {
@ -678,6 +697,21 @@
"deepResearch": "심층 조사", "deepResearch": "심층 조사",
"voice": "음성 입력" "voice": "음성 입력"
}, },
"voice": {
"hint": "클릭해 받아쓰거나 길게 누르기",
"stop": "녹음 중지",
"transcribing": "변환 중...",
"recordingStatus": "녹음 중 {{time}}"
},
"voiceErrors": {
"unsupported": "이 브라우저는 음성 입력을 지원하지 않습니다.",
"permission": "마이크 권한이 필요합니다.",
"notConfigured": "먼저 음성 변환 제공업체를 설정하세요.",
"tooLong": "녹음 시간이 너무 깁니다.",
"tooShort": "음성을 녹음하려면 조금 더 길게 눌러 주세요.",
"noInput": "마이크 입력이 감지되지 않았습니다.",
"failed": "오디오를 변환하지 못했습니다."
},
"slash": { "slash": {
"ariaLabel": "슬래시 명령", "ariaLabel": "슬래시 명령",
"label": "명령", "label": "명령",

View File

@ -73,6 +73,7 @@
"models": "Mô hình", "models": "Mô hình",
"providers": "Nhà cung cấp", "providers": "Nhà cung cấp",
"image": "Hình ảnh", "image": "Hình ảnh",
"voice": "Giọng nói",
"browser": "Trang web", "browser": "Trang web",
"runtime": "Hệ thống", "runtime": "Hệ thống",
"advanced": "Bảo mật", "advanced": "Bảo mật",
@ -99,7 +100,8 @@
"mcp": "Dịch vụ MCP", "mcp": "Dịch vụ MCP",
"apps": "Ứng dụng", "apps": "Ứng dụng",
"nativeHost": "Host gốc", "nativeHost": "Host gốc",
"hostSafety": "An toàn ứng dụng" "hostSafety": "An toàn ứng dụng",
"voiceInput": "Nhập bằng giọng nói"
}, },
"rows": { "rows": {
"theme": "Chủ đề", "theme": "Chủ đề",
@ -142,7 +144,13 @@
"engine": "Bộ máy", "engine": "Bộ máy",
"logs": "Nhật ký", "logs": "Nhật ký",
"diagnostics": "Chẩn đoán", "diagnostics": "Chẩn đoán",
"contextWindow": "Cửa sổ ngữ cảnh" "contextWindow": "Cửa sổ ngữ cảnh",
"transcription": "Phiên âm",
"transcriptionProvider": "Nhà cung cấp",
"transcriptionProviderStatus": "Trạng thái nhà cung cấp",
"transcriptionModel": "Mô hình",
"transcriptionLanguage": "Ngôn ngữ",
"voiceLimits": "Giới hạn"
}, },
"help": { "help": {
"theme": "Chuyển giữa giao diện sáng và tối.", "theme": "Chuyển giữa giao diện sáng và tối.",
@ -181,7 +189,12 @@
"diagnostics": "Exporta un pequeño informe de runtime para soporte.", "diagnostics": "Exporta un pequeño informe de runtime para soporte.",
"localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.", "localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.",
"webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.", "webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.",
"contextWindow": "Chọn ngân sách ngữ cảnh mặc định cho cấu hình mô hình này." "contextWindow": "Chọn ngân sách ngữ cảnh mặc định cho cấu hình mô hình này.",
"transcription": "Phiên âm đầu vào micro trước khi gửi. Tin nhắn giọng nói từ kênh chat dùng cùng cài đặt.",
"transcriptionProvider": "Dùng thông tin xác thực của nhà cung cấp từ Providers.",
"transcriptionProviderStatus": "API key nằm trong providers, không nằm trong cài đặt transcription.",
"transcriptionModel": "Giữ mặc định đã resolve trừ khi nhà cung cấp cần id model tùy chỉnh.",
"transcriptionLanguage": "Gợi ý ISO-639 tùy chọn, như en, zh, ja hoặc ko."
}, },
"values": { "values": {
"light": "Sáng", "light": "Sáng",
@ -283,6 +296,7 @@
"totalProviders": "{{count}} khả dụng", "totalProviders": "{{count}} khả dụng",
"webSearch": "Tìm kiếm web", "webSearch": "Tìm kiếm web",
"imageGeneration": "Tạo hình ảnh", "imageGeneration": "Tạo hình ảnh",
"voiceInput": "Nhập bằng giọng nói",
"workspace": "Không gian làm việc" "workspace": "Không gian làm việc"
}, },
"usage": { "usage": {
@ -486,6 +500,11 @@
"rawInstructions": "SKILL.md gốc", "rawInstructions": "SKILL.md gốc",
"rawInstructionsEmpty": "Không có hướng dẫn gốc.", "rawInstructionsEmpty": "Không có hướng dẫn gốc.",
"detailDescription": "Chi tiết cho {{name}}." "detailDescription": "Chi tiết cho {{name}}."
},
"voice": {
"selectProvider": "Chọn nhà cung cấp",
"configureProvider": "Cấu hình nhà cung cấp",
"languageAuto": "Tự động"
} }
}, },
"chat": { "chat": {
@ -678,6 +697,21 @@
"deepResearch": "Nghiên cứu sâu", "deepResearch": "Nghiên cứu sâu",
"voice": "Nhập bằng giọng nói" "voice": "Nhập bằng giọng nói"
}, },
"voice": {
"hint": "Bấm để đọc chính tả hoặc nhấn giữ",
"stop": "Dừng ghi âm",
"transcribing": "Đang chép lời...",
"recordingStatus": "Đang ghi {{time}}"
},
"voiceErrors": {
"unsupported": "Trình duyệt này không hỗ trợ nhập bằng giọng nói.",
"permission": "Cần quyền truy cập micrô.",
"notConfigured": "Hãy cấu hình nhà cung cấp chép lời trước.",
"tooLong": "Bản ghi âm quá dài.",
"tooShort": "Giữ lâu hơn một chút để ghi âm giọng nói.",
"noInput": "Không phát hiện đầu vào micrô.",
"failed": "Không thể chép lời âm thanh."
},
"slash": { "slash": {
"ariaLabel": "Lệnh slash", "ariaLabel": "Lệnh slash",
"label": "lệnh", "label": "lệnh",

View File

@ -73,6 +73,7 @@
"models": "模型", "models": "模型",
"providers": "提供商", "providers": "提供商",
"image": "图片", "image": "图片",
"voice": "语音",
"browser": "网页", "browser": "网页",
"cliApps": "CLI 应用", "cliApps": "CLI 应用",
"mcp": "MCP", "mcp": "MCP",
@ -99,7 +100,8 @@
"capabilities": "能力", "capabilities": "能力",
"apps": "应用", "apps": "应用",
"nativeHost": "原生宿主", "nativeHost": "原生宿主",
"hostSafety": "应用安全" "hostSafety": "应用安全",
"voiceInput": "语音识别"
}, },
"models": { "models": {
"selectModel": "选择模型", "selectModel": "选择模型",
@ -161,7 +163,13 @@
"engine": "引擎", "engine": "引擎",
"logs": "日志", "logs": "日志",
"diagnostics": "诊断", "diagnostics": "诊断",
"contextWindow": "上下文窗口" "contextWindow": "上下文窗口",
"transcription": "语音转写",
"transcriptionProvider": "提供商",
"transcriptionProviderStatus": "提供商状态",
"transcriptionModel": "模型",
"transcriptionLanguage": "语言",
"voiceLimits": "限制"
}, },
"help": { "help": {
"theme": "在浅色和深色外观之间切换。", "theme": "在浅色和深色外观之间切换。",
@ -200,7 +208,12 @@
"diagnostics": "导出一份用于支持排查的小型运行报告。", "diagnostics": "导出一份用于支持排查的小型运行报告。",
"localServiceAccessNative": "允许完全访问权限下的 shell 命令访问这台 Mac 上的服务。", "localServiceAccessNative": "允许完全访问权限下的 shell 命令访问这台 Mac 上的服务。",
"webuiDefaultAccessNative": "用于没有单独项目权限的原生聊天。", "webuiDefaultAccessNative": "用于没有单独项目权限的原生聊天。",
"contextWindow": "选择此模型配置的默认上下文预算。" "contextWindow": "选择此模型配置的默认上下文预算。",
"transcription": "发送前先把麦克风输入转写到输入框。聊天渠道里的语音消息也使用同一套设置。",
"transcriptionProvider": "使用「提供商」中对应提供商的凭据。",
"transcriptionProviderStatus": "API Key 仍保存在 providers 里,不写进 transcription 设置。",
"transcriptionModel": "除非提供商需要自定义模型 ID,否则保持解析后的默认值即可。",
"transcriptionLanguage": "可选 ISO-639 语言提示,例如 en、zh、ja 或 ko。"
}, },
"timezone": { "timezone": {
"select": "选择时区", "select": "选择时区",
@ -391,6 +404,7 @@
"totalProviders": "共 {{count}} 个可用", "totalProviders": "共 {{count}} 个可用",
"webSearch": "网页搜索", "webSearch": "网页搜索",
"imageGeneration": "图片生成", "imageGeneration": "图片生成",
"voiceInput": "语音识别",
"workspace": "工作区" "workspace": "工作区"
}, },
"usage": { "usage": {
@ -486,6 +500,11 @@
"rawInstructions": "原始 SKILL.md", "rawInstructions": "原始 SKILL.md",
"rawInstructionsEmpty": "没有原始说明。", "rawInstructionsEmpty": "没有原始说明。",
"detailDescription": "{{name}} 的详情。" "detailDescription": "{{name}} 的详情。"
},
"voice": {
"selectProvider": "选择提供商",
"configureProvider": "配置提供商",
"languageAuto": "自动"
} }
}, },
"chat": { "chat": {
@ -677,6 +696,21 @@
"deepResearch": "深度研究", "deepResearch": "深度研究",
"voice": "语音输入" "voice": "语音输入"
}, },
"voice": {
"hint": "点击进行听写或长按",
"stop": "停止录音",
"transcribing": "正在转写...",
"recordingStatus": "正在录音 {{time}}"
},
"voiceErrors": {
"unsupported": "当前浏览器不支持语音输入。",
"permission": "需要麦克风权限。",
"notConfigured": "请先配置转写提供商。",
"tooLong": "录音时间太长。",
"tooShort": "请稍微多录一会儿。",
"noInput": "没有检测到麦克风输入。",
"failed": "语音转写失败。"
},
"slash": { "slash": {
"ariaLabel": "斜杠命令", "ariaLabel": "斜杠命令",
"label": "命令", "label": "命令",

View File

@ -73,6 +73,7 @@
"models": "模型", "models": "模型",
"providers": "提供商", "providers": "提供商",
"image": "圖片", "image": "圖片",
"voice": "語音",
"browser": "網頁", "browser": "網頁",
"runtime": "系統", "runtime": "系統",
"advanced": "安全", "advanced": "安全",
@ -99,7 +100,8 @@
"mcp": "MCP 服務", "mcp": "MCP 服務",
"apps": "應用", "apps": "應用",
"nativeHost": "原生宿主", "nativeHost": "原生宿主",
"hostSafety": "App 安全" "hostSafety": "App 安全",
"voiceInput": "語音辨識"
}, },
"rows": { "rows": {
"theme": "主題", "theme": "主題",
@ -142,7 +144,13 @@
"engine": "引擎", "engine": "引擎",
"logs": "日誌", "logs": "日誌",
"diagnostics": "診斷", "diagnostics": "診斷",
"contextWindow": "上下文視窗" "contextWindow": "上下文視窗",
"transcription": "語音轉寫",
"transcriptionProvider": "提供商",
"transcriptionProviderStatus": "提供商狀態",
"transcriptionModel": "模型",
"transcriptionLanguage": "語言",
"voiceLimits": "限制"
}, },
"help": { "help": {
"theme": "在淺色與深色外觀之間切換。", "theme": "在淺色與深色外觀之間切換。",
@ -181,7 +189,12 @@
"diagnostics": "匯出一份用於支援排查的小型執行報告。", "diagnostics": "匯出一份用於支援排查的小型執行報告。",
"localServiceAccessNative": "允許完全訪問權限下的 shell 命令訪問這台 Mac 上的服務。", "localServiceAccessNative": "允許完全訪問權限下的 shell 命令訪問這台 Mac 上的服務。",
"webuiDefaultAccessNative": "用於沒有單獨專案權限的原生聊天。", "webuiDefaultAccessNative": "用於沒有單獨專案權限的原生聊天。",
"contextWindow": "選擇此模型配置的預設上下文預算。" "contextWindow": "選擇此模型配置的預設上下文預算。",
"transcription": "送出前先把麥克風輸入轉寫到輸入框。聊天渠道的語音訊息也使用同一組設定。",
"transcriptionProvider": "使用「提供商」中對應提供商的憑證。",
"transcriptionProviderStatus": "API Key 仍保存在 providers 裡,不寫進 transcription 設定。",
"transcriptionModel": "除非提供商需要自訂模型 ID，否則保持解析後的預設值即可。",
"transcriptionLanguage": "可選 ISO-639 語言提示,例如 en、zh、ja 或 ko。"
}, },
"values": { "values": {
"light": "淺色", "light": "淺色",
@ -283,6 +296,7 @@
"totalProviders": "共 {{count}} 個可用", "totalProviders": "共 {{count}} 個可用",
"webSearch": "網頁搜尋", "webSearch": "網頁搜尋",
"imageGeneration": "圖片生成", "imageGeneration": "圖片生成",
"voiceInput": "語音辨識",
"workspace": "工作區" "workspace": "工作區"
}, },
"usage": { "usage": {
@ -486,6 +500,11 @@
"rawInstructions": "原始 SKILL.md", "rawInstructions": "原始 SKILL.md",
"rawInstructionsEmpty": "沒有原始說明。", "rawInstructionsEmpty": "沒有原始說明。",
"detailDescription": "{{name}} 的詳細資訊。" "detailDescription": "{{name}} 的詳細資訊。"
},
"voice": {
"selectProvider": "選擇提供商",
"configureProvider": "設定提供商",
"languageAuto": "自動"
} }
}, },
"chat": { "chat": {
@ -678,6 +697,21 @@
"deepResearch": "深度研究", "deepResearch": "深度研究",
"voice": "語音輸入" "voice": "語音輸入"
}, },
"voice": {
"hint": "點擊進行聽寫或長按",
"stop": "停止錄音",
"transcribing": "正在轉寫...",
"recordingStatus": "正在錄音 {{time}}"
},
"voiceErrors": {
"unsupported": "目前瀏覽器不支援語音輸入。",
"permission": "需要麥克風權限。",
"notConfigured": "請先設定轉寫提供商。",
"tooLong": "錄音時間太長。",
"tooShort": "請稍微多錄一會兒。",
"noInput": "沒有偵測到麥克風輸入。",
"failed": "語音轉寫失敗。"
},
"slash": { "slash": {
"ariaLabel": "斜線命令", "ariaLabel": "斜線命令",
"label": "命令", "label": "命令",

210
webui/src/lib/ansi.ts Normal file
View File

@ -0,0 +1,210 @@
/** A run of text together with the style that was active when it was emitted. */
export type AnsiSegment = {
// Literal text between escape sequences; the sequences themselves are not included.
text: string;
// `undefined` when no styling attribute is active for this run.
style?: AnsiStyle;
};
/**
 * Style derived from the SGR state, using camelCase CSS property names.
 * Only properties that are actually active are present on a value of this type.
 */
export type AnsiStyle = {
// CSS colour string (hex palette entry or `rgb(r, g, b)`).
backgroundColor?: string;
// CSS colour string (hex palette entry or `rgb(r, g, b)`).
color?: string;
// Present only while the italic attribute is on.
fontStyle?: "italic";
// Set to 700 while the bold attribute is on.
fontWeight?: number;
// Set to 0.72 while the dim attribute is on.
opacity?: number;
// Present only while the underline attribute is on.
textDecorationLine?: "underline";
};
/**
 * Mutable SGR attribute state carried across escape sequences while parsing.
 * The colour keys are absent (not merely falsy) when the default colour is in effect.
 */
type AnsiState = {
backgroundColor?: string;
// Bold and dim are mutually exclusive in this model: SGR 1 clears dim and SGR 2 clears bold.
bold: boolean;
color?: string;
dim: boolean;
// When true, foreground and background are swapped at style-generation time.
inverse: boolean;
italic: boolean;
underline: boolean;
};
// The escape character (code point 27) that introduces every control sequence.
const ESC = "\u001b";

// One CSI sequence: ESC "[", any parameter bytes (0-?), any intermediate bytes
// (space-/), then a single final byte (@-~). Global, so `lastIndex` is stateful.
const ANSI_PATTERN = new RegExp(ESC + "\\[[0-?]*[ -/]*[@-~]", "g");

// Palette entries 0-7 (SGR 30-37 / 40-47).
const ANSI_COLORS = [
  "#000000",
  "#cd3131",
  "#0dbc79",
  "#e5e510",
  "#2472c8",
  "#bc3fbc",
  "#11a8cd",
  "#e5e5e5",
];

// Palette entries 8-15 (SGR 90-97 / 100-107).
const ANSI_BRIGHT_COLORS = [
  "#666666",
  "#f14c4c",
  "#23d18b",
  "#f5f543",
  "#3b8eea",
  "#d670d6",
  "#29b8db",
  "#ffffff",
];

// Channel intensities used for the 6x6x6 colour cube (palette entries 16-231).
const RGB_STEPS = [0, 95, 135, 175, 215, 255];
/** Reports whether `value` contains at least one CSI escape sequence. */
export function hasAnsi(value: string): boolean {
  // The shared pattern is global and therefore stateful: always search from the start.
  ANSI_PATTERN.lastIndex = 0;
  const firstMatch = ANSI_PATTERN.exec(value);
  return firstMatch !== null;
}
/** Returns `value` with every CSI escape sequence removed. */
export function stripAnsi(value: string): string {
  // Leave the shared global pattern rewound, whatever a previous caller did with it.
  ANSI_PATTERN.lastIndex = 0;
  // Cutting the string at each sequence and gluing the pieces back drops the sequences.
  const visibleParts = value.split(ANSI_PATTERN);
  return visibleParts.join("");
}
/**
 * Creates a fresh attribute state: every flag off and no colour keys present.
 * A new object is returned on each call so callers may mutate it freely.
 */
function initialState(): AnsiState {
  const pristine: AnsiState = {
    bold: false,
    dim: false,
    inverse: false,
    italic: false,
    underline: false,
  };
  return pristine;
}
/**
 * Maps a 256-colour palette index to a CSS colour string.
 *
 * - 0-7: standard palette, 8-15: bright palette;
 * - 16-231: 6x6x6 colour cube built from `RGB_STEPS`;
 * - 232-255: grayscale ramp from 8 to 238 in steps of 10.
 *
 * @param value Palette index taken from an SGR parameter list.
 * @returns The colour, or `undefined` when `value` is not an integer in 0-255.
 */
function colorFrom256(value: number): string | undefined {
  // A truncated sequence such as `ESC[38;5m` supplies no index, so `value` arrives as
  // `undefined`/NaN. Every range comparison is false for those, which used to fall
  // through to the grayscale formula and yield "rgb(NaN, NaN, NaN)". Reject them here.
  if (!Number.isInteger(value) || value < 0 || value > 255) return undefined;
  if (value < 8) return ANSI_COLORS[value];
  if (value < 16) return ANSI_BRIGHT_COLORS[value - 8];
  if (value < 232) {
    const offset = value - 16;
    const red = RGB_STEPS[Math.floor(offset / 36)];
    const green = RGB_STEPS[Math.floor((offset % 36) / 6)];
    const blue = RGB_STEPS[offset % 6];
    return `rgb(${red}, ${green}, ${blue})`;
  }
  const gray = 8 + (value - 232) * 10;
  return `rgb(${gray}, ${gray}, ${gray})`;
}
/**
 * Builds a CSS `rgb(...)` colour from three channel values.
 *
 * @returns The colour string, or `undefined` if any channel is non-finite or outside 0-255.
 */
function colorFromRgb(red: number, green: number, blue: number): string | undefined {
  const isChannel = (channel: number): boolean =>
    Number.isFinite(channel) && channel >= 0 && channel <= 255;

  if (isChannel(red) && isChannel(green) && isChannel(blue)) {
    return `rgb(${red}, ${green}, ${blue})`;
  }
  return undefined;
}
// An SGR parameter list consists solely of decimal digits and `;` / `:` separators.
const SGR_PARAMS_PATTERN = /^[\d;:]+$/;

/**
 * Extracts the numeric parameters of an SGR sequence.
 *
 * @param sequence A complete CSI sequence (ESC, "[", body, final byte).
 * @returns The parameters, with empty ones read as 0 and an empty list read as `[0]`;
 *   `null` when the sequence is not SGR.
 */
function normalizedSgrParams(sequence: string): number[] | null {
  if (!sequence.endsWith("m")) return null;
  const body = sequence.slice(2, -1).trim();
  if (!body) return [0];
  // A final byte of "m" is not enough: sequences carrying a private marker, e.g. xterm's
  // key-modifier controls `ESC[>4;2m` / `ESC[?4m`, are not SGR. Parsing them as SGR turned
  // the unparsable first field into 0 (a full reset) and applied the rest as attributes.
  if (!SGR_PARAMS_PATTERN.test(body)) return null;
  return body.split(/[;:]/).map((part) => {
    const value = Number.parseInt(part || "0", 10);
    // Absurdly long digit runs parse to Infinity; treat them as 0 like other bad input.
    return Number.isFinite(value) ? value : 0;
  });
}
/**
 * Handles the extended colour forms that follow SGR 38 / 48.
 *
 * `params[index]` is the 38/48 code itself; the next parameter selects the mode:
 * 5 = palette index (one operand), 2 = direct RGB (three operands).
 *
 * @param key Which colour slot of `state` to write.
 * @returns The index of the last parameter consumed, so the caller's loop resumes after
 *   it. For an unrecognised mode nothing is consumed and `index` is returned unchanged.
 */
function applyExtendedColor(
  state: AnsiState,
  params: number[],
  index: number,
  key: "color" | "backgroundColor",
): number {
  const [mode, first, second, third] = params.slice(index + 1, index + 5);
  switch (mode) {
    case 5: {
      const indexed = colorFrom256(first);
      // An invalid index leaves the current colour untouched but is still consumed.
      if (indexed) state[key] = indexed;
      return index + 2;
    }
    case 2: {
      const direct = colorFromRgb(first, second, third);
      if (direct) state[key] = direct;
      return index + 4;
    }
    default:
      return index;
  }
}
/**
 * Applies one SGR (Select Graphic Rendition) parameter list to `state`, in place.
 *
 * Supported codes: 0 (reset), 1/2 (bold/dim), 3 (italic), 4 (underline), 7 (inverse),
 * 22-24 and 27 (attribute off), 30-37/90-97 (foreground), 40-47/100-107 (background),
 * 38/48 (extended colour) and 39/49 (default colour). Unknown codes are ignored.
 *
 * @param state Attribute state to mutate.
 * @param params Parameters as produced by `normalizedSgrParams`.
 */
function applySgrParams(state: AnsiState, params: number[]): void {
  for (let index = 0; index < params.length; index += 1) {
    const code = params[index];
    switch (code) {
      case 0:
        // Full reset. The optional colour keys must be removed explicitly: copying a
        // fresh state over this one (the previous approach) only overwrote the boolean
        // flags, so text after `ESC[0m` kept its foreground/background colour.
        state.bold = false;
        state.dim = false;
        state.inverse = false;
        state.italic = false;
        state.underline = false;
        delete state.color;
        delete state.backgroundColor;
        break;
      case 1:
        // Bold and dim are modelled as mutually exclusive.
        state.bold = true;
        state.dim = false;
        break;
      case 2:
        state.dim = true;
        state.bold = false;
        break;
      case 3:
        state.italic = true;
        break;
      case 4:
        state.underline = true;
        break;
      case 7:
        state.inverse = true;
        break;
      case 22:
        state.bold = false;
        state.dim = false;
        break;
      case 23:
        state.italic = false;
        break;
      case 24:
        state.underline = false;
        break;
      case 27:
        state.inverse = false;
        break;
      case 38:
        // The helper returns the index of the last operand it consumed.
        index = applyExtendedColor(state, params, index, "color");
        break;
      case 39:
        delete state.color;
        break;
      case 48:
        index = applyExtendedColor(state, params, index, "backgroundColor");
        break;
      case 49:
        delete state.backgroundColor;
        break;
      default:
        if (code >= 30 && code <= 37) {
          state.color = ANSI_COLORS[code - 30];
        } else if (code >= 40 && code <= 47) {
          state.backgroundColor = ANSI_COLORS[code - 40];
        } else if (code >= 90 && code <= 97) {
          state.color = ANSI_BRIGHT_COLORS[code - 90];
        } else if (code >= 100 && code <= 107) {
          state.backgroundColor = ANSI_BRIGHT_COLORS[code - 100];
        }
        break;
    }
  }
}
/**
 * Converts the current attribute state into a style object.
 *
 * With `inverse` set, foreground and background swap; a side that has no explicit
 * colour simply contributes nothing.
 *
 * @returns The style, or `undefined` when no attribute yields a property.
 */
function styleFromState(state: AnsiState): AnsiStyle | undefined {
  let textColor = state.color;
  let fillColor = state.backgroundColor;
  if (state.inverse) {
    [textColor, fillColor] = [fillColor, textColor];
  }

  const css: AnsiStyle = {};
  if (textColor) css.color = textColor;
  if (fillColor) css.backgroundColor = fillColor;
  if (state.bold) css.fontWeight = 700;
  if (state.dim) css.opacity = 0.72;
  if (state.italic) css.fontStyle = "italic";
  if (state.underline) css.textDecorationLine = "underline";

  return Object.keys(css).length === 0 ? undefined : css;
}
/**
 * Splits `value` into styled text runs.
 *
 * Every CSI sequence is removed from the output. SGR sequences update the running
 * style; all other CSI sequences are dropped without effect. Text is emitted with the
 * style that was active at the point it appeared.
 *
 * @returns Non-empty segments in source order.
 */
export function parseAnsiSegments(value: string): AnsiSegment[] {
  const state = initialState();
  const segments: AnsiSegment[] = [];

  // Emits value[from, to) with the style in force right now; empty ranges are skipped.
  const emit = (from: number, to: number): void => {
    if (to <= from) return;
    segments.push({
      text: value.slice(from, to),
      style: styleFromState(state),
    });
  };

  let consumed = 0;
  // The shared pattern is global; rewind it so iteration starts at the beginning.
  ANSI_PATTERN.lastIndex = 0;
  for (const escape of value.matchAll(ANSI_PATTERN)) {
    const start = escape.index ?? 0;
    const sequence = escape[0];
    emit(consumed, start);
    const sgr = normalizedSgrParams(sequence);
    if (sgr !== null) applySgrParams(state, sgr);
    consumed = start + sequence.length;
  }
  emit(consumed, value.length);

  return segments.filter((segment) => segment.text.length > 0);
}

View File

@ -16,6 +16,7 @@ import type {
SkillDetail, SkillDetail,
SkillsPayload, SkillsPayload,
SlashCommand, SlashCommand,
TranscriptionSettingsUpdate,
WebSearchSettingsUpdate, WebSearchSettingsUpdate,
WorkspacesPayload, WorkspacesPayload,
WebuiThreadPersistedPayload, WebuiThreadPersistedPayload,
@ -547,3 +548,21 @@ export async function updateImageGenerationSettings(
token, token,
); );
} }
/**
 * Sends the transcription settings to the backend as query parameters.
 *
 * @param token Passed through to `request` unchanged.
 * @param update New values; numeric and boolean fields are stringified.
 * @param base Optional prefix prepended to the API path.
 * @returns Whatever `request` resolves with for this endpoint.
 */
export async function updateTranscriptionSettings(
  token: string,
  update: TranscriptionSettingsUpdate,
  base: string = "",
): Promise<SettingsPayload> {
  const query = new URLSearchParams({
    enabled: String(update.enabled),
    provider: update.provider,
    model: update.model,
    language: update.language,
    max_duration_sec: String(update.maxDurationSec),
    max_upload_mb: String(update.maxUploadMb),
  });
  const endpoint = `${base}/api/settings/transcription/update?${query}`;
  return request<SettingsPayload>(endpoint, token);
}

View File

@ -95,6 +95,12 @@ interface PendingNewChat {
timer: ReturnType<typeof setTimeout>; timer: ReturnType<typeof setTimeout>;
} }
/** Bookkeeping for one in-flight `transcribe_audio` request, stored under its request id. */
interface PendingTranscription {
// Settles the caller's promise with the transcribed text.
resolve: (text: string) => void;
// Rejects the caller's promise (server error, socket close, or timeout).
reject: (err: Error) => void;
// Timeout handle; cleared whenever the request settles before the timeout fires.
timer: ReturnType<typeof setTimeout>;
}
export interface NanobotClientOptions { export interface NanobotClientOptions {
url: string; url: string;
reconnect?: boolean; reconnect?: boolean;
@ -132,6 +138,7 @@ export class NanobotClient {
/** Latest ``goal_state`` snapshot per ``chat_id`` (multi-session isolation). */ /** Latest ``goal_state`` snapshot per ``chat_id`` (multi-session isolation). */
private goalStateByChatId = new Map<string, GoalStateWsPayload>(); private goalStateByChatId = new Map<string, GoalStateWsPayload>();
private pendingNewChat: PendingNewChat | null = null; private pendingNewChat: PendingNewChat | null = null;
private pendingTranscriptions = new Map<string, PendingTranscription>();
// Frames queued while the socket is not yet OPEN // Frames queued while the socket is not yet OPEN
private sendQueue: Outbound[] = []; private sendQueue: Outbound[] = [];
private reconnectAttempts = 0; private reconnectAttempts = 0;
@ -320,6 +327,27 @@ export class NanobotClient {
}); });
} }
/**
 * Requests a transcription of recorded audio over the socket.
 *
 * The request is registered under a fresh id so the matching `transcription_result` /
 * `transcription_error` event can settle it. It is also rejected when the socket
 * closes or when no answer arrives in time.
 *
 * @param dataUrl The audio, encoded as a data URL.
 * @param options `durationMs` is forwarded as `duration_ms` only when provided;
 *   `timeoutMs` defaults to 120 000 ms.
 * @returns A promise for the transcribed text.
 */
transcribeAudio(
  dataUrl: string,
  options?: { durationMs?: number; timeoutMs?: number },
): Promise<string> {
  const requestId = crypto.randomUUID();
  const timeoutMs = options?.timeoutMs ?? 120_000;
  const durationMs = options?.durationMs;

  return new Promise<string>((resolve, reject) => {
    const expire = (): void => {
      this.pendingTranscriptions.delete(requestId);
      reject(new Error("transcription timed out"));
    };
    const timer = setTimeout(expire, timeoutMs);
    // Register before sending so a reply can always find its entry.
    this.pendingTranscriptions.set(requestId, { resolve, reject, timer });
    this.queueSend(
      durationMs === undefined
        ? { type: "transcribe_audio", request_id: requestId, data_url: dataUrl }
        : {
            type: "transcribe_audio",
            request_id: requestId,
            data_url: dataUrl,
            duration_ms: durationMs,
          },
    );
  });
}
attach(chatId: string): void { attach(chatId: string): void {
this.knownChats.add(chatId); this.knownChats.add(chatId);
if (this.socket?.readyState === WS_OPEN) { if (this.socket?.readyState === WS_OPEN) {
@ -425,6 +453,16 @@ export class NanobotClient {
return; return;
} }
if (parsed.event === "transcription_result") {
this.resolveTranscription(parsed.request_id, parsed.text);
return;
}
if (parsed.event === "transcription_error") {
this.rejectTranscription(parsed.request_id, parsed.detail || "error");
return;
}
if (parsed.event === "session_updated") { if (parsed.event === "session_updated") {
this.emitSessionUpdate(parsed.chat_id, parsed.scope, parsed.workspace_scope); this.emitSessionUpdate(parsed.chat_id, parsed.scope, parsed.workspace_scope);
return; return;
@ -500,6 +538,7 @@ export class NanobotClient {
this.pendingNewChat.reject(new Error("socket closed")); this.pendingNewChat.reject(new Error("socket closed"));
this.pendingNewChat = null; this.pendingNewChat = null;
} }
this.rejectAllTranscriptions("socket closed");
// Surface structured reasons *before* reconnect logic so the UI can // Surface structured reasons *before* reconnect logic so the UI can
// display the error even while the client transparently reconnects. // display the error even while the client transparently reconnects.
// Browsers populate ``CloseEvent.code`` with the wire-level close code; // Browsers populate ``CloseEvent.code`` with the wire-level close code;
@ -528,6 +567,34 @@ export class NanobotClient {
} }
} }
/**
 * Completes the pending transcription registered under `requestId` with `text`.
 * Unknown ids (already settled or timed out) are ignored.
 */
private resolveTranscription(requestId: string, text: string): void {
  const entry = this.pendingTranscriptions.get(requestId);
  if (entry === undefined) return;
  this.pendingTranscriptions.delete(requestId);
  clearTimeout(entry.timer);
  entry.resolve(text);
}
/**
 * Fails the pending transcription registered under `requestId` with `detail`.
 *
 * Without a request id the error cannot be attributed to one request, so every pending
 * transcription is rejected. Unknown ids are ignored.
 */
private rejectTranscription(requestId: string | undefined, detail: string): void {
  if (requestId) {
    const entry = this.pendingTranscriptions.get(requestId);
    if (entry !== undefined) {
      this.pendingTranscriptions.delete(requestId);
      clearTimeout(entry.timer);
      entry.reject(new Error(detail));
    }
    return;
  }
  this.rejectAllTranscriptions(detail);
}
/**
 * Rejects every pending transcription with its own `Error(detail)`, cancels the
 * timeouts, and leaves the registry empty.
 */
private rejectAllTranscriptions(detail: string): void {
  // Removing the entry currently being visited is safe while iterating a Map.
  this.pendingTranscriptions.forEach((entry, requestId, registry) => {
    clearTimeout(entry.timer);
    entry.reject(new Error(detail));
    registry.delete(requestId);
  });
}
private scheduleReconnect(): void { private scheduleReconnect(): void {
this.setStatus("reconnecting"); this.setStatus("reconnecting");
const attempt = this.reconnectAttempts++; const attempt = this.reconnectAttempts++;

View File

@ -391,6 +391,23 @@ export interface SettingsPayload {
default_api_base?: string | null; default_api_base?: string | null;
}>; }>;
}; };
transcription?: {
enabled: boolean;
provider: string;
provider_configured: boolean;
model: string;
language: string | null;
max_duration_sec: number;
max_upload_mb: number;
providers: Array<{
name: string;
label: string;
configured: boolean;
api_key_hint?: string | null;
api_base?: string | null;
default_api_base?: string | null;
}>;
};
runtime: { runtime: {
config_path: string; config_path: string;
workspace_path: string; workspace_path: string;
@ -680,6 +697,15 @@ export interface ImageGenerationSettingsUpdate {
maxImagesPerTurn: number; maxImagesPerTurn: number;
} }
/**
 * Values submitted when saving the transcription settings.
 * Field names are camelCase here; they are sent to the backend under snake_case keys.
 */
export interface TranscriptionSettingsUpdate {
enabled: boolean;
// Name of the provider to use for transcription.
provider: string;
model: string;
// Language hint. NOTE(review): an empty string presumably means "auto" — confirm against the backend.
language: string;
// Recording length limit; seconds, judging by the name — confirm against the backend.
maxDurationSec: number;
// Upload size limit; megabytes, judging by the name — confirm against the backend.
maxUploadMb: number;
}
export interface SlashCommand { export interface SlashCommand {
command: string; command: string;
title: string; title: string;
@ -782,6 +808,13 @@ export type InboundEvent =
scope?: "metadata" | "thread" | string; scope?: "metadata" | "thread" | string;
workspace_scope?: WorkspaceScopePayload; workspace_scope?: WorkspaceScopePayload;
} }
| { event: "transcription_result"; request_id: string; text: string }
| {
event: "transcription_error";
request_id?: string;
detail?: string;
provider?: string;
}
| { event: "error"; chat_id?: string; detail?: string; reason?: string }; | { event: "error"; chat_id?: string; detail?: string; reason?: string };
/** Base64-encoded image attached to an outbound ``message`` envelope. /** Base64-encoded image attached to an outbound ``message`` envelope.
@ -845,6 +878,7 @@ export type Outbound =
| { type: "new_chat"; workspace_scope?: WorkspaceScopePayload } | { type: "new_chat"; workspace_scope?: WorkspaceScopePayload }
| { type: "attach"; chat_id: string } | { type: "attach"; chat_id: string }
| { type: "set_workspace_scope"; chat_id: string; workspace_scope: WorkspaceScopePayload } | { type: "set_workspace_scope"; chat_id: string; workspace_scope: WorkspaceScopePayload }
| { type: "transcribe_audio"; request_id: string; data_url: string; duration_ms?: number }
| { | {
type: "message"; type: "message";
chat_id: string; chat_id: string;

View File

@ -1172,13 +1172,13 @@ describe("App layout", () => {
it("restores the settings section from the URL hash after a page reload", async () => { it("restores the settings section from the URL hash after a page reload", async () => {
mockFetchRoutes({ "/api/settings": baseSettingsPayload() }); mockFetchRoutes({ "/api/settings": baseSettingsPayload() });
window.history.replaceState(null, "", "/#/settings?section=models"); window.history.replaceState(null, "", "/#/settings?section=voice");
render(<App />); render(<App />);
await waitFor(() => expect(connectSpy).toHaveBeenCalled()); await waitFor(() => expect(connectSpy).toHaveBeenCalled());
expect(await screen.findByRole("heading", { name: "Models" })).toBeInTheDocument(); expect(await screen.findByRole("heading", { name: "Voice input" })).toBeInTheDocument();
expect(window.location.hash).toBe("#/settings?section=models"); expect(window.location.hash).toBe("#/settings?section=voice");
}); });
it("updates the URL hash when switching settings sections", async () => { it("updates the URL hash when switching settings sections", async () => {
@ -1197,6 +1197,11 @@ describe("App layout", () => {
expect(await screen.findByRole("heading", { name: "Models" })).toBeInTheDocument(); expect(await screen.findByRole("heading", { name: "Models" })).toBeInTheDocument();
expect(window.location.hash).toBe("#/settings?section=models"); expect(window.location.hash).toBe("#/settings?section=models");
fireEvent.click(within(settingsNav).getByRole("button", { name: "Voice" }));
expect(await screen.findByRole("heading", { name: "Voice input" })).toBeInTheDocument();
expect(window.location.hash).toBe("#/settings?section=voice");
}); });
it("opens Apps from the main sidebar without replacing the sidebar", async () => { it("opens Apps from the main sidebar without replacing the sidebar", async () => {

View File

@ -1,4 +1,5 @@
import { act, render, screen } from "@testing-library/react"; import { act, render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { describe, expect, it, vi } from "vitest"; import { describe, expect, it, vi } from "vitest";
import { CodeBlock } from "@/components/CodeBlock"; import { CodeBlock } from "@/components/CodeBlock";
@ -87,6 +88,64 @@ describe("CodeBlock", () => {
expect(screen.getByText("const value = 1;")).toBeInTheDocument(); expect(screen.getByText("const value = 1;")).toBeInTheDocument();
}); });
it("renders ANSI output without mounting the syntax highlighter", () => {
render(
<ThemeProvider theme="dark">
<CodeBlock
language="ansi"
code={"\x1b[32mPASS\x1b[0m <script>alert(1)</script>"}
/>
</ThemeProvider>,
);
expect(screen.queryByTestId("highlighted-code")).not.toBeInTheDocument();
expect(screen.getByTestId("ansi-code")).toBeInTheDocument();
expect(screen.getByTestId("ansi-code").closest(".not-prose")).toBeTruthy();
expect(screen.getByText("ansi")).toBeInTheDocument();
expect(screen.getByText("PASS")).toHaveStyle({ color: "#0dbc79" });
expect(screen.getByText("<script>alert(1)</script>")).toBeInTheDocument();
expect(document.querySelector("script")).toBeNull();
});
it("detects ANSI sequences in regular code blocks", () => {
render(
<ThemeProvider theme="light">
<CodeBlock
language="text"
code={"\x1b[38;2;35;209;139mtruecolor\x1b[0m"}
/>
</ThemeProvider>,
);
expect(screen.queryByTestId("highlighted-code")).not.toBeInTheDocument();
expect(screen.getByText("truecolor")).toHaveStyle({
color: "rgb(35, 209, 139)",
});
});
it("copies ANSI output as clean text", async () => {
const user = userEvent.setup();
const writeText = vi.fn().mockResolvedValue(undefined);
Object.defineProperty(navigator, "clipboard", {
configurable: true,
value: { writeText },
});
try {
render(
<ThemeProvider theme="dark">
<CodeBlock language="ansi" code={"\x1b[32mPASS\x1b[0m"} />
</ThemeProvider>,
);
await user.click(screen.getByRole("button", { name: /copy/i }));
expect(writeText).toHaveBeenCalledWith("PASS");
} finally {
Reflect.deleteProperty(navigator, "clipboard");
}
});
it("reads theme from context without creating per-block observers", async () => { it("reads theme from context without creating per-block observers", async () => {
const originalMutationObserver = globalThis.MutationObserver; const originalMutationObserver = globalThis.MutationObserver;
const observer = vi.fn(); const observer = vi.fn();

View File

@ -412,6 +412,61 @@ describe("NanobotClient", () => {
); );
}); });
it("sends transcription requests and resolves transcription results outside chat dispatch", async () => {
const client = new NanobotClient({
url: "ws://test",
reconnect: false,
socketFactory: (url) => new FakeSocket(url) as unknown as WebSocket,
});
const handler = vi.fn();
client.onChat("chat-a", handler);
client.connect();
lastSocket().fakeOpen();
const promise = client.transcribeAudio("data:audio/webm;base64,AAAA", {
durationMs: 1234,
timeoutMs: 1_000,
});
const frame = JSON.parse(lastSocket().sent.at(-1) as string);
expect(frame).toMatchObject({
type: "transcribe_audio",
data_url: "data:audio/webm;base64,AAAA",
duration_ms: 1234,
});
expect(typeof frame.request_id).toBe("string");
lastSocket().fakeMessage({
event: "transcription_result",
request_id: frame.request_id,
text: "hello from voice",
});
await expect(promise).resolves.toBe("hello from voice");
expect(handler).not.toHaveBeenCalled();
});
it("rejects pending transcription requests on server errors and socket close", async () => {
const client = new NanobotClient({
url: "ws://test",
reconnect: false,
socketFactory: (url) => new FakeSocket(url) as unknown as WebSocket,
});
client.connect();
lastSocket().fakeOpen();
const errored = client.transcribeAudio("data:audio/webm;base64,AAAA", { timeoutMs: 1_000 });
const errorFrame = JSON.parse(lastSocket().sent.at(-1) as string);
lastSocket().fakeMessage({
event: "transcription_error",
request_id: errorFrame.request_id,
detail: "not_configured",
});
await expect(errored).rejects.toThrow("not_configured");
const dropped = client.transcribeAudio("data:audio/webm;base64,BBBB", { timeoutMs: 1_000 });
lastSocket().close();
await expect(dropped).rejects.toThrow("socket closed");
});
it("queues sends while connecting and flushes on open", () => { it("queues sends while connecting and flushes on open", () => {
const client = new NanobotClient({ const client = new NanobotClient({
url: "ws://test", url: "ws://test",

View File

@ -1,4 +1,4 @@
import { fireEvent, render, screen, waitFor, within } from "@testing-library/react"; import { act, fireEvent, render, screen, waitFor, within } from "@testing-library/react";
import { afterEach, describe, expect, it, vi } from "vitest"; import { afterEach, describe, expect, it, vi } from "vitest";
import { ThreadComposer } from "@/components/thread/ThreadComposer"; import { ThreadComposer } from "@/components/thread/ThreadComposer";
@ -121,6 +121,7 @@ const MCP_PRESETS: McpPresetInfo[] = [
}, },
]; ];
const ORIGINAL_INNER_HEIGHT = window.innerHeight; const ORIGINAL_INNER_HEIGHT = window.innerHeight;
const ORIGINAL_MEDIA_DEVICES = navigator.mediaDevices;
function mockBlobUrls() { function mockBlobUrls() {
Object.defineProperty(URL, "createObjectURL", { Object.defineProperty(URL, "createObjectURL", {
@ -135,7 +136,16 @@ function mockBlobUrls() {
afterEach(() => { afterEach(() => {
vi.restoreAllMocks(); vi.restoreAllMocks();
vi.unstubAllGlobals();
Reflect.deleteProperty(window, "nanobotHost"); Reflect.deleteProperty(window, "nanobotHost");
if (ORIGINAL_MEDIA_DEVICES) {
Object.defineProperty(navigator, "mediaDevices", {
configurable: true,
value: ORIGINAL_MEDIA_DEVICES,
});
} else {
Reflect.deleteProperty(navigator, "mediaDevices");
}
window.localStorage.clear(); window.localStorage.clear();
Object.defineProperty(window, "innerHeight", { Object.defineProperty(window, "innerHeight", {
value: ORIGINAL_INNER_HEIGHT, value: ORIGINAL_INNER_HEIGHT,
@ -161,6 +171,75 @@ function rect(init: Partial<DOMRect>): DOMRect {
}; };
} }
/**
 * Installs test doubles for microphone capture: `navigator.mediaDevices.getUserMedia`
 * and a global `MediaRecorder`.
 *
 * The fake recorder produces no data while "recording"; on `stop()` it delivers `blob`
 * through `ondataavailable` and then fires `onstop`.
 *
 * @param blob Payload handed to `ondataavailable`; its type is also reported as `mimeType`.
 * @returns The `getUserMedia` mock and the `stop` spy shared by the fake stream's tracks.
 */
function mockVoiceRecorder(blob = new Blob(["voice"], { type: "audio/webm" })) {
  const stopTrack = vi.fn();
  const getUserMedia = vi.fn(async () => ({
    getTracks: () => [{ stop: stopTrack }],
  }));

  class FakeMediaRecorder {
    // Only plain WebM audio is advertised as recordable.
    static isTypeSupported = vi.fn((type: string) => type === "audio/webm");

    mimeType = blob.type;
    state: RecordingState = "inactive";
    onstop: (() => void) | null = null;
    ondataavailable: ((event: BlobEvent) => void) | null = null;

    start() {
      this.state = "recording";
    }

    stop() {
      this.state = "inactive";
      // Data first, then the stop notification.
      if (this.ondataavailable) {
        this.ondataavailable({ data: blob } as BlobEvent);
      }
      if (this.onstop) {
        this.onstop();
      }
    }
  }

  Object.defineProperty(navigator, "mediaDevices", {
    value: { getUserMedia },
    configurable: true,
  });
  vi.stubGlobal("MediaRecorder", FakeMediaRecorder);

  return { getUserMedia, stopTrack };
}
/**
 * Installs a fake global `AudioContext` whose analyser reports a constant signal, and
 * backs the animation-frame functions with timers so level polling advances with time.
 *
 * @param sample Byte written to every slot of the time-domain buffer. The tests that
 *   expect a "no input" warning rely on the default of 128.
 * @param state Value reported as the context's `state`.
 */
function mockVoiceAudioInput(sample = 128, state: AudioContextState = "running") {
  const contextState = state;
  const fillWithSample = (data: Uint8Array) => data.fill(sample);

  class FakeAudioContext {
    state = contextState;
    resume = vi.fn(async () => undefined);
    close = vi.fn(async () => undefined);

    createAnalyser() {
      return {
        fftSize: 256,
        smoothingTimeConstant: 0,
        disconnect: vi.fn(),
        getByteTimeDomainData: fillWithSample,
      };
    }

    createMediaStreamSource() {
      return { connect: vi.fn(), disconnect: vi.fn() };
    }
  }

  vi.stubGlobal("AudioContext", FakeAudioContext);

  // Frames become 16 ms timers, so a frame id is really a timer id.
  vi.spyOn(window, "cancelAnimationFrame").mockImplementation((id) =>
    window.clearTimeout(id as unknown as number)
  );
  vi.spyOn(window, "requestAnimationFrame").mockImplementation((callback) =>
    window.setTimeout(() => callback(performance.now()), 16) as unknown as number
  );
}
/**
 * Lets 700 ms of real time pass inside `act`, so a recording that has just started is
 * long enough to be accepted before the test stops it.
 */
async function waitForVoiceCapture(): Promise<void> {
  const captureWindowMs = 700;
  const sleep = (): Promise<unknown> =>
    new Promise((resolve) => setTimeout(resolve, captureWindowMs));

  await act(async () => {
    await sleep();
  });
}
describe("ThreadComposer", () => { describe("ThreadComposer", () => {
it("renders a readonly hero model composer when provided", () => { it("renders a readonly hero model composer when provided", () => {
render( render(
@ -209,6 +288,245 @@ describe("ThreadComposer", () => {
expect(screen.queryByText(/Enter to send/)).not.toBeInTheDocument(); expect(screen.queryByText(/Enter to send/)).not.toBeInTheDocument();
}); });
it("transcribes voice input into the composer without sending", async () => {
mockVoiceRecorder();
const onSend = vi.fn();
const onTranscribeAudio = vi.fn(async () => "hello voice");
render(
<ThreadComposer
onSend={onSend}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await waitForVoiceCapture();
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledWith(
expect.stringMatching(/^data:audio\/webm;base64,/),
expect.objectContaining({ durationMs: expect.any(Number) }),
));
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("hello voice"));
expect(onSend).not.toHaveBeenCalled();
});
it("does not start duplicate voice recordings while microphone access is pending", async () => {
const { getUserMedia, stopTrack } = mockVoiceRecorder();
let resolveStream: ((stream: MediaStream) => void) | undefined;
getUserMedia.mockImplementation(() => new Promise((resolve) => {
resolveStream = resolve as (stream: MediaStream) => void;
}));
const onTranscribeAudio = vi.fn(async () => "one recording");
render(
<ThreadComposer
onSend={vi.fn()}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
const voiceButton = screen.getByRole("button", { name: "Voice input" });
fireEvent.click(voiceButton);
fireEvent.click(voiceButton);
expect(getUserMedia).toHaveBeenCalledTimes(1);
await act(async () => {
resolveStream?.({ getTracks: () => [{ stop: stopTrack }] } as unknown as MediaStream);
});
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await waitForVoiceCapture();
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledTimes(1));
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("one recording"));
});
it("supports press-and-hold voice recording", async () => {
mockVoiceRecorder();
const onSend = vi.fn();
const onTranscribeAudio = vi.fn(async () => "held voice");
render(
<ThreadComposer
onSend={onSend}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
const voiceButton = screen.getByRole("button", { name: "Voice input" });
fireEvent.pointerDown(voiceButton, { button: 0, pointerId: 1, pointerType: "touch" });
await act(async () => {
await new Promise((resolve) => setTimeout(resolve, 180));
});
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await waitForVoiceCapture();
fireEvent.pointerUp(screen.getByRole("button", { name: "Stop recording" }), {
pointerId: 1,
pointerType: "touch",
});
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalled());
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("held voice"));
expect(onSend).not.toHaveBeenCalled();
});
it("supports keyboard hold voice recording", async () => {
mockVoiceRecorder();
const onSend = vi.fn();
const onTranscribeAudio = vi.fn(async () => "shortcut voice");
render(
<ThreadComposer
onSend={onSend}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
const voiceButton = screen.getByRole("button", { name: "Voice input" });
expect(voiceButton).toHaveAttribute("title", "Click to dictate or hold");
expect(voiceButton).toHaveAttribute("aria-keyshortcuts", "Control+Shift+D");
fireEvent.keyDown(window, { code: "KeyD", ctrlKey: true, key: "D", shiftKey: true });
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await waitForVoiceCapture();
fireEvent.keyUp(window, { code: "KeyD", ctrlKey: true, key: "D", shiftKey: true });
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalled());
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("shortcut voice"));
expect(onSend).not.toHaveBeenCalled();
});
it("ignores the delayed click emitted after a long-press voice recording", async () => {
const { getUserMedia } = mockVoiceRecorder();
const onTranscribeAudio = vi.fn(async () => "held once");
render(
<ThreadComposer
onSend={vi.fn()}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
const voiceButton = screen.getByRole("button", { name: "Voice input" });
fireEvent.pointerDown(voiceButton, { button: 0, pointerId: 1, pointerType: "touch" });
await act(async () => {
await new Promise((resolve) => setTimeout(resolve, 180));
});
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await waitForVoiceCapture();
fireEvent.pointerUp(screen.getByRole("button", { name: "Stop recording" }), {
pointerId: 1,
pointerType: "touch",
});
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("held once"));
await act(async () => {
await new Promise((resolve) => setTimeout(resolve, 20));
});
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
expect(getUserMedia).toHaveBeenCalledTimes(1);
expect(onTranscribeAudio).toHaveBeenCalledTimes(1);
});
it("keeps existing text when voice transcription fails", async () => {
mockVoiceRecorder();
const onSend = vi.fn();
const onTranscribeAudio = vi.fn(async () => {
throw new Error("not_configured");
});
render(
<ThreadComposer
onSend={onSend}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
const input = screen.getByLabelText("Message input");
fireEvent.change(input, { target: { value: "draft" } });
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
await waitForVoiceCapture();
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
await waitFor(() => {
expect(screen.getByText("Configure a transcription provider first.")).toBeInTheDocument();
});
expect(input).toHaveValue("draft");
expect(onSend).not.toHaveBeenCalled();
});
it("does not transcribe recordings that are too short", async () => {
mockVoiceRecorder();
const onTranscribeAudio = vi.fn(async () => "should not appear");
render(
<ThreadComposer
onSend={vi.fn()}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
await waitFor(() => {
expect(screen.getByText("Hold a little longer to record voice.")).toBeInTheDocument();
});
expect(onTranscribeAudio).not.toHaveBeenCalled();
});
it("warns during recording when microphone input is silent", async () => {
mockVoiceRecorder();
mockVoiceAudioInput();
const onTranscribeAudio = vi.fn(async () => "should not appear");
render(
<ThreadComposer
onSend={vi.fn()}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await act(async () => {
await new Promise((resolve) => setTimeout(resolve, 1_150));
});
expect(screen.getByText("No microphone input detected.")).toBeInTheDocument();
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
expect(onTranscribeAudio).not.toHaveBeenCalled();
});
it("does not treat unavailable microphone levels as silence", async () => {
mockVoiceRecorder();
mockVoiceAudioInput(128, "suspended");
const onTranscribeAudio = vi.fn(async () => "voice text");
render(
<ThreadComposer
onSend={vi.fn()}
onTranscribeAudio={onTranscribeAudio}
placeholder="Type your message..."
/>,
);
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
await act(async () => {
await new Promise((resolve) => setTimeout(resolve, 1_150));
});
expect(screen.queryByText("No microphone input detected.")).not.toBeInTheDocument();
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledTimes(1));
expect(screen.getByDisplayValue("voice text")).toBeInTheDocument();
});
it("renders and changes workspace access mode", async () => { it("renders and changes workspace access mode", async () => {
const onWorkspaceScopeChange = vi.fn(); const onWorkspaceScopeChange = vi.fn();
render( render(