mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-06-15 07:14:08 +00:00
feat(transcription): add shared voice input support (#4232)
* feat(webui): add voice transcription input * feat(webui): render ANSI output in code blocks * refactor(webui): isolate voice recorder logic * refactor(transcription): keep websocket ingress thin * refactor(transcription): resolve channel audio settings on demand * style(webui): neutralize voice waveform color * feat(webui): add voice input tooltip * feat(webui): add voice input keyboard shortcut * fix(webui): distinguish voice shortcut platforms * fix(webui): place voice button after model selector * refactor(webui): share voice hold recording helpers * fix(desktop): allow microphone voice input * fix(webui): stabilize token usage month labels * feat(webui): show voice input on settings overview * fix(webui): label voice capability as recognition * fix(webui): align capability overview status * refactor(webui): isolate transcription socket handling * fix(webui): soften silent voice waveform * refactor(audio): clarify transcription service location * docs(transcription): clarify audio and provider boundaries * fix(exec): reduce session output polling flake
This commit is contained in:
parent
06d454a225
commit
9c81280300
@ -47,6 +47,9 @@
|
||||
],
|
||||
"mac": {
|
||||
"category": "public.app-category.developer-tools",
|
||||
"extendInfo": {
|
||||
"NSMicrophoneUsageDescription": "nanobot uses the microphone to transcribe voice input before you send messages."
|
||||
},
|
||||
"target": [
|
||||
"dmg"
|
||||
]
|
||||
|
||||
@ -15,6 +15,7 @@ import {
|
||||
protocol,
|
||||
session,
|
||||
shell,
|
||||
systemPreferences,
|
||||
} from "electron";
|
||||
import type { IpcMainInvokeEvent, WebContents } from "electron";
|
||||
|
||||
@ -100,6 +101,58 @@ function isTrustedAppUrl(rawUrl: string): boolean {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Decide whether a permission request originates from the trusted app UI.
 *
 * Candidate identifiers are consulted from most to least specific:
 * `details.requestingUrl`, then `details.securityOrigin`, then the URL of
 * the owning `webContents`. Only the first candidate that is a non-empty
 * string is evaluated. Accepting the request when *any* candidate is trusted
 * would let a foreign sub frame embedded in a trusted page inherit the page's
 * trust, because the page URL is reported alongside the frame's own URL.
 *
 * @param webContents - Contents that own the request; may be `null`.
 * @param details - Raw permission details from Electron; no shape is assumed.
 * @returns `true` only when the most specific available URL is trusted.
 */
function isTrustedPermissionRequest(
  webContents: WebContents | null,
  details: unknown,
): boolean {
  const candidates: unknown[] = [
    permissionDetail(details, "requestingUrl"),
    permissionDetail(details, "securityOrigin"),
    webContents?.getURL(),
  ];
  // Skip absent or empty values so a missing field falls through to the next,
  // less specific identifier instead of failing the whole check.
  const requester = candidates.find(
    (url): url is string => typeof url === "string" && url.length > 0,
  );
  return requester !== undefined && isTrustedAppUrl(requester);
}
|
||||
|
||||
/**
 * Read one property from an untyped permission-details value.
 *
 * @param details - Value of unknown shape.
 * @param key - Property name to look up.
 * @returns The property value, or `undefined` when `details` is `null` or
 *   not an object.
 */
function permissionDetail(details: unknown, key: string): unknown {
  if (typeof details !== "object" || details === null) {
    return undefined;
  }
  const bag = details as Record<string, unknown>;
  return bag[key];
}
|
||||
|
||||
/**
 * Report whether a media permission request asks for audio and nothing else.
 *
 * When `details.mediaTypes` is an array it must contain `"audio"` and must
 * not contain `"video"`. Otherwise the singular `details.mediaType` has to
 * equal `"audio"`.
 *
 * @param details - Raw permission details; no shape is assumed.
 * @returns `true` for audio-only requests, `false` for everything else.
 */
function isAudioOnlyMediaRequest(details: unknown): boolean {
  const requested = permissionDetail(details, "mediaTypes");
  if (!Array.isArray(requested)) {
    return permissionDetail(details, "mediaType") === "audio";
  }
  const wantsAudio = requested.includes("audio");
  const wantsVideo = requested.includes("video");
  return wantsAudio && !wantsVideo;
}
|
||||
|
||||
/**
 * Resolve the operating-system level microphone permission.
 *
 * On platforms other than macOS (`darwin`) this resolves to `true` without
 * consulting the system. On macOS an already `granted` status resolves to
 * `true`, `denied` or `restricted` resolves to `false`, and any other status
 * asks the system for access and resolves to its answer.
 *
 * @returns Whether microphone capture is permitted.
 */
async function requestNativeMicrophoneAccess(): Promise<boolean> {
  if (process.platform !== "darwin") {
    return true;
  }
  switch (systemPreferences.getMediaAccessStatus("microphone")) {
    case "granted":
      return true;
    case "denied":
    case "restricted":
      return false;
    default:
      return systemPreferences.askForMediaAccess("microphone");
  }
}
|
||||
|
||||
/**
 * Install the permission check and request handlers on the default session.
 *
 * Both handlers allow a request only when it is a `media` permission, comes
 * from a trusted app URL, and is audio-only. The request handler additionally
 * waits for the native microphone decision and treats a rejected promise as
 * a denial.
 */
function registerPermissionHandlers(): void {
  // Single predicate shared by both handlers so they cannot drift apart.
  const isAllowedMicrophoneRequest = (
    webContents: WebContents | null,
    permission: string,
    details: unknown,
  ): boolean =>
    permission === "media"
    && isTrustedPermissionRequest(webContents, details)
    && isAudioOnlyMediaRequest(details);

  const { defaultSession } = session;

  defaultSession.setPermissionCheckHandler(
    (webContents, permission, _origin, details) =>
      isAllowedMicrophoneRequest(webContents, permission, details),
  );

  defaultSession.setPermissionRequestHandler(
    (webContents, permission, callback, details) => {
      if (!isAllowedMicrophoneRequest(webContents, permission, details)) {
        callback(false);
        return;
      }
      void requestNativeMicrophoneAccess().then(
        (granted) => callback(granted),
        () => callback(false),
      );
    },
  );
}
|
||||
|
||||
function assertTrustedIpc(event: IpcMainInvokeEvent): void {
|
||||
const frameUrl = event.senderFrame?.url || event.sender.getURL();
|
||||
if (!isTrustedAppUrl(frameUrl)) {
|
||||
@ -749,6 +802,7 @@ app.whenReady().then(async () => {
|
||||
}
|
||||
|
||||
registerIpcHandlers();
|
||||
registerPermissionHandlers();
|
||||
registerAppProtocol(webDist, devUrl);
|
||||
|
||||
mainWindow = createWindow();
|
||||
|
||||
@ -234,7 +234,7 @@ nanobot channels login <channel_name> --force # re-authenticate
|
||||
| `_handle_message(sender_id, chat_id, content, media?, metadata?, session_key?)` | **Call this when you receive a message.** Checks `is_allowed()`, then publishes to the bus. Automatically sets `_wants_stream` if `supports_streaming` is true. |
|
||||
| `is_allowed(sender_id)` | Checks against `config.allow_from`; `"*"` allows all, `[]` denies all. |
|
||||
| `default_config()` (classmethod) | Returns default config dict for `nanobot onboard`. Override to declare your fields. |
|
||||
| `transcribe_audio(file_path)` | Transcribes audio via Groq Whisper (if configured). |
|
||||
| `transcribe_audio(file_path)` | Transcribes audio via the shared top-level `transcription` config (if configured). |
|
||||
| `supports_streaming` (property) | `True` when config has `"streaming": true` **and** subclass overrides `send_delta()`. |
|
||||
| `is_running` | Returns `self._running`. |
|
||||
| `login(force=False)` | Perform interactive login (e.g. QR code scan). Returns `True` if already authenticated or login succeeds. Override in subclasses that support interactive login. |
|
||||
|
||||
@ -119,7 +119,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent
|
||||
## Providers
|
||||
|
||||
> [!TIP]
|
||||
> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead, and optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) for more accurate transcription. The API key is picked from the matching provider config.
|
||||
> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` to use OpenAI Whisper. API keys still live in the matching `providers.<provider>` config.
|
||||
> - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
|
||||
> - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
|
||||
> - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
|
||||
@ -1100,6 +1100,61 @@ Set `agents.defaults.modelPreset` to start with a named preset:
|
||||
|
||||
When `modelPreset` is `null` or omitted, startup uses the implicit `default` preset from `agents.defaults.*`. Runtime changes made with `/model <preset>` are not written back to `config.json`; they affect future turns until the process restarts or another model/config change replaces them.
|
||||
|
||||
## Transcription Settings
|
||||
|
||||
Audio transcription is a shared capability used by chat-channel voice messages and by WebUI/desktop microphone input. Chat-channel voice messages are transcribed automatically before they enter the agent. WebUI and desktop microphone input is transcribed into the composer first, so you can edit the text before sending.
|
||||
|
||||
Configure transcription under the top-level `transcription` section:
|
||||
|
||||
```json
|
||||
{
|
||||
"transcription": {
|
||||
"enabled": true,
|
||||
"provider": "groq",
|
||||
"model": null,
|
||||
"language": null,
|
||||
"maxDurationSec": 120,
|
||||
"maxUploadMb": 25
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
| Setting | Default | Description |
|
||||
|---------|---------|-------------|
|
||||
| `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. |
|
||||
| `provider` | `"groq"` | Transcription backend: `"groq"` or `"openai"`. |
|
||||
| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq and `whisper-1` for OpenAI. |
|
||||
| `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. |
|
||||
| `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration, in seconds (allowed range: 1–600). |
|
||||
| `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size, in megabytes (allowed range: 1–100). |
|
||||
|
||||
Provider and language resolution is intentionally ordered for backwards compatibility:
|
||||
|
||||
1. `transcription.provider` / `transcription.language`
|
||||
2. Legacy `channels.transcriptionProvider` / `channels.transcriptionLanguage`
|
||||
3. Built-in defaults (`provider: "groq"`, no language hint)
|
||||
|
||||
The legacy `channels.*` transcription fields existed before transcription became a shared capability across chat channels and WebUI/desktop microphone input. They are still read so older `config.json` files keep working, but they are no longer the preferred configuration surface. If both old and new fields are present, the top-level `transcription` values are the source of truth.
|
||||
|
||||
Transcription credentials are intentionally not stored in `transcription`. Put the API key and optional endpoint in the matching provider config:
|
||||
|
||||
```json
|
||||
{
|
||||
"providers": {
|
||||
"groq": {
|
||||
"apiKey": "gsk-...",
|
||||
"apiBase": "https://api.groq.com/openai/v1"
|
||||
}
|
||||
},
|
||||
"transcription": {
|
||||
"provider": "groq",
|
||||
"language": "zh"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Selecting a transcription provider does not configure credentials by itself. For example, the effective provider may default to Groq for compatibility, but transcription is only usable when `providers.groq.apiKey` or the matching environment-backed config is available. The Settings UI writes only the top-level `transcription` fields.
|
||||
|
||||
## Channel Settings
|
||||
|
||||
Global settings that apply to all channels. Configure under the `channels` section in `~/.nanobot/config.json`:
|
||||
@ -1111,8 +1166,6 @@ Global settings that apply to all channels. Configure under the `channels` secti
|
||||
"sendToolHints": false,
|
||||
"extractDocumentText": true,
|
||||
"sendMaxRetries": 3,
|
||||
"transcriptionProvider": "groq",
|
||||
"transcriptionLanguage": null,
|
||||
"telegram": { ... }
|
||||
}
|
||||
}
|
||||
@ -1125,8 +1178,8 @@ Global settings that apply to all channels. Configure under the `channels` secti
|
||||
| `showReasoning` | `true` | Allow channels to surface model reasoning/thinking content (DeepSeek-R1 `reasoning_content`, Anthropic `thinking_blocks`, inline `<think>` tags). Reasoning flows as a dedicated stream with `_reasoning_delta` / `_reasoning_end` markers — channels override `send_reasoning_delta` / `send_reasoning_end` to render in-place updates. Even with `true`, channels without those overrides stay no-op silently. Currently surfaced on CLI and WebSocket/WebUI (italic shimmer header, auto-collapses after the stream ends); Telegram / Slack / Discord / Feishu / WeChat / Matrix keep the base no-op until their bubble UI is adapted. Independent of `sendProgress`. |
|
||||
| `extractDocumentText` | `true` | Extract supported document/text attachments into the model prompt. Set to `false` to keep document content out of the prompt and include attachment path references instead. |
|
||||
| `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
|
||||
| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key and optional `apiBase` are auto-resolved from the matching provider config. Chat-style bases such as `https://api.groq.com/openai/v1` are normalized to the audio transcription endpoint. |
|
||||
| `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. |
|
||||
|
||||
`channels.transcriptionProvider` and `channels.transcriptionLanguage` are deprecated compatibility fields. They remain as a read-only fallback for older configs, but new configuration should use top-level `transcription.provider` and `transcription.language`.
|
||||
|
||||
`sendProgress` and `sendToolHints` can also be overridden per channel. The
|
||||
global values stay as defaults for channels that do not set their own value:
|
||||
|
||||
@ -24,6 +24,7 @@ DEFAULT_WAIT_FOR_MS = 10_000
|
||||
MAX_WAIT_FOR_MS = 120_000
|
||||
DEFAULT_MAX_OUTPUT_CHARS = 10_000
|
||||
MAX_OUTPUT_CHARS = 50_000
|
||||
OUTPUT_DRAIN_GRACE_S = 0.1
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
@ -139,6 +140,8 @@ class _ExecSession:
|
||||
asyncio.gather(self._stdout_task, self._stderr_task),
|
||||
timeout=2.0,
|
||||
)
|
||||
elif yield_time_ms > 0:
|
||||
await self._wait_for_buffered_output()
|
||||
|
||||
async with self._lock:
|
||||
output = "".join(self._chunks)
|
||||
@ -163,6 +166,14 @@ class _ExecSession:
|
||||
with suppress(asyncio.TimeoutError):
|
||||
await asyncio.wait_for(self.process.wait(), timeout=5.0)
|
||||
|
||||
async def _wait_for_buffered_output(self) -> None:
    """Briefly wait until at least one output chunk has been buffered.

    Polls ``self._chunks`` under ``self._lock`` every 10 ms and returns as
    soon as it is non-empty, or once ``OUTPUT_DRAIN_GRACE_S`` seconds have
    elapsed, whichever comes first.
    """
    give_up_at = time.monotonic() + OUTPUT_DRAIN_GRACE_S
    while True:
        if time.monotonic() >= give_up_at:
            return
        async with self._lock:
            has_output = bool(self._chunks)
        if has_output:
            return
        await asyncio.sleep(0.01)
|
||||
|
||||
|
||||
class ExecSessionManager:
|
||||
def __init__(self, *, max_sessions: int = 8, idle_timeout: int = 1800) -> None:
|
||||
|
||||
2
nanobot/audio/__init__.py
Normal file
2
nanobot/audio/__init__.py
Normal file
@ -0,0 +1,2 @@
|
||||
"""Shared audio service helpers."""
|
||||
|
||||
183
nanobot/audio/transcription.py
Normal file
183
nanobot/audio/transcription.py
Normal file
@ -0,0 +1,183 @@
|
||||
"""Application-level audio transcription service.
|
||||
|
||||
This module owns nanobot's transcription behavior: config resolution,
|
||||
legacy channel fallback, upload validation, temporary-file handling, and
|
||||
dispatch to provider adapters. It deliberately does not know provider-specific
|
||||
HTTP details; those live in ``nanobot.providers.transcription``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from nanobot.config.paths import get_media_dir
|
||||
from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url
|
||||
|
||||
TranscriptionProviderName = Literal["groq", "openai"]
|
||||
|
||||
_DEFAULT_PROVIDER: TranscriptionProviderName = "groq"
|
||||
_DEFAULT_MODELS: dict[TranscriptionProviderName, str] = {
|
||||
"groq": "whisper-large-v3",
|
||||
"openai": "whisper-1",
|
||||
}
|
||||
_MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024
|
||||
_AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({
|
||||
"audio/aac",
|
||||
"audio/flac",
|
||||
"audio/m4a",
|
||||
"audio/mp4",
|
||||
"audio/mpeg",
|
||||
"audio/ogg",
|
||||
"audio/wav",
|
||||
"audio/webm",
|
||||
"audio/x-m4a",
|
||||
"audio/x-wav",
|
||||
})
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EffectiveTranscriptionConfig:
    """Immutable snapshot of the transcription settings after resolution.

    Instances carry everything needed to perform a transcription call, so
    callers do not have to consult the raw configuration again.
    """

    # Whether transcription is switched on.
    enabled: bool
    # Selected backend name.
    provider: TranscriptionProviderName
    # Model identifier passed to the backend.
    model: str
    # Optional language hint; None when no hint is set.
    language: str | None
    # Credential for the backend; excluded from repr() so it does not leak
    # into logs or tracebacks.
    api_key: str = field(repr=False)
    # Backend base URL override.
    api_base: str
    # Upper bound on recording length, in seconds.
    max_duration_sec: int
    # Upper bound on upload size, in megabytes.
    max_upload_mb: int

    @property
    def configured(self) -> bool:
        """Report whether a non-empty API key is present."""
        return True if self.api_key else False
|
||||
|
||||
|
||||
class TranscriptionIngressError(Exception):
    """Upload/transcription failure identified by a stable detail code.

    Attributes:
        detail: Short code naming the failure; also used as the exception
            message.
        extra: Additional keyword context supplied by the raiser.
    """

    def __init__(self, detail: str, **extra: Any):
        super().__init__(detail)
        self.extra = extra
        self.detail = detail
|
||||
|
||||
|
||||
def _as_provider(value: Any) -> TranscriptionProviderName | None:
    """Normalize *value* to a known provider name.

    Strings are stripped and lower-cased before being matched against the
    keys of ``_DEFAULT_MODELS``. Non-strings and unknown names yield ``None``.
    """
    if not isinstance(value, str):
        return None
    candidate = value.strip().lower()
    if candidate not in _DEFAULT_MODELS:
        return None
    return candidate  # type: ignore[return-value]
|
||||
|
||||
|
||||
def _provider_config(config: Any, provider: str) -> Any:
|
||||
return getattr(getattr(config, "providers", None), provider, None)
|
||||
|
||||
|
||||
def _extract_data_url_mime(url: str) -> str | None:
|
||||
header, _, _ = url.partition(",")
|
||||
if not header.startswith("data:") or ";base64" not in header:
|
||||
return None
|
||||
return header[5:].split(";", 1)[0].strip().lower() or None
|
||||
|
||||
|
||||
def resolve_transcription_config(config: Any) -> EffectiveTranscriptionConfig:
    """Resolve top-level transcription settings with legacy channel fallback.

    Provider and language are taken from ``config.transcription`` first, then
    from the legacy ``config.channels.transcription_*`` fields, then from the
    built-in defaults. The API key and base URL come from
    ``config.providers.<provider>``; missing values become empty strings.

    Args:
        config: Application config object. Every section is read with
            ``getattr`` so missing sections fall back to defaults.

    Returns:
        The fully resolved, immutable transcription configuration.
    """
    top = getattr(config, "transcription", None)
    channels = getattr(config, "channels", None)
    provider = (
        _as_provider(getattr(top, "provider", None))
        or _as_provider(getattr(channels, "transcription_provider", None))
        or _DEFAULT_PROVIDER
    )
    provider_cfg = _provider_config(config, provider)

    # Strip BEFORE falling back: a blank or whitespace-only override must
    # resolve to the provider default rather than to an empty model name.
    raw_model = getattr(top, "model", None)
    model = (raw_model.strip() if isinstance(raw_model, str) else "") or _DEFAULT_MODELS[provider]

    # A limit explicitly set to None is treated like a missing one.
    max_duration = getattr(top, "max_duration_sec", None)
    max_upload = getattr(top, "max_upload_mb", None)

    return EffectiveTranscriptionConfig(
        enabled=bool(getattr(top, "enabled", True)),
        provider=provider,
        model=model,
        language=getattr(top, "language", None) or getattr(channels, "transcription_language", None),
        api_key=getattr(provider_cfg, "api_key", None) or "",
        api_base=getattr(provider_cfg, "api_base", None) or "",
        max_duration_sec=int(max_duration) if max_duration is not None else 120,
        max_upload_mb=int(max_upload) if max_upload is not None else 25,
    )
|
||||
|
||||
|
||||
async def transcribe_audio_data_url(
    data_url: Any,
    config: EffectiveTranscriptionConfig,
    *,
    duration_ms: Any = None,
) -> str:
    """Validate, persist, transcribe, and remove a WebUI audio data URL.

    Args:
        data_url: Base64 ``data:`` URL holding the recorded audio.
        config: Already-resolved transcription configuration.
        duration_ms: Client-reported recording length in milliseconds. It is
            only checked when it is a number.

    Returns:
        The non-empty transcript text.

    Raises:
        TranscriptionIngressError: With detail ``missing_audio``, ``disabled``,
            ``not_configured``, ``duration``, ``mime``, ``size``, ``decode``
            or ``empty``, depending on which step rejected the request.
    """
    if not (isinstance(data_url, str) and data_url):
        raise TranscriptionIngressError("missing_audio")
    if not config.enabled:
        raise TranscriptionIngressError("disabled")
    if not config.configured:
        raise TranscriptionIngressError("not_configured", provider=config.provider)
    if isinstance(duration_ms, (int, float)):
        # The ceiling is the configured maximum plus a 1000 ms allowance.
        if duration_ms > (config.max_duration_sec * 1000 + 1000):
            raise TranscriptionIngressError("duration")
    if _extract_data_url_mime(data_url) not in _AUDIO_MIME_ALLOWED:
        raise TranscriptionIngressError("mime")

    if config.max_upload_mb:
        byte_limit = config.max_upload_mb * 1024 * 1024
    else:
        byte_limit = _MAX_AUDIO_BYTES_FALLBACK
    byte_limit = max(1, byte_limit)

    saved_path: str | None = None
    try:
        saved_path = save_base64_data_url(
            data_url,
            get_media_dir("webui-transcription"),
            max_bytes=byte_limit,
        )
    except FileSizeExceeded as exc:
        raise TranscriptionIngressError("size") from exc
    except Exception as exc:
        # Any other failure is logged; the check below reports it as "decode".
        logger.warning("transcription audio decode failed: {}", exc)
    if not saved_path:
        raise TranscriptionIngressError("decode")

    try:
        transcript = await transcribe_audio_file(saved_path, config)
    finally:
        # The temporary upload is removed whether or not transcription worked.
        with suppress(OSError):
            Path(saved_path).unlink(missing_ok=True)
    if not transcript:
        raise TranscriptionIngressError("empty")
    return transcript
|
||||
|
||||
|
||||
async def transcribe_audio_file(
    file_path: str | Path,
    config: EffectiveTranscriptionConfig,
) -> str:
    """Transcribe *file_path* using the already-resolved transcription config.

    Returns an empty string without touching any provider when transcription
    is disabled or not configured. Otherwise the OpenAI adapter is used for
    provider ``"openai"`` and the Groq adapter for every other value.
    """
    if not (config.enabled and config.configured):
        return ""

    # Adapters are imported lazily, and only the one that is actually needed.
    if config.provider == "openai":
        from nanobot.providers.transcription import (
            OpenAITranscriptionProvider as adapter_cls,
        )
    else:
        from nanobot.providers.transcription import (
            GroqTranscriptionProvider as adapter_cls,
        )

    adapter = adapter_cls(
        api_key=config.api_key,
        api_base=config.api_base or None,
        language=config.language,
        model=config.model,
    )
    return await adapter.transcribe(file_path)
|
||||
@ -28,10 +28,6 @@ class BaseChannel(ABC):
|
||||
|
||||
name: str = "base"
|
||||
display_name: str = "Base"
|
||||
transcription_provider: str = "groq"
|
||||
transcription_api_key: str = ""
|
||||
transcription_api_base: str = ""
|
||||
transcription_language: str | None = None
|
||||
send_progress: bool = True
|
||||
send_tool_hints: bool = False
|
||||
show_reasoning: bool = True
|
||||
@ -51,24 +47,14 @@ class BaseChannel(ABC):
|
||||
|
||||
async def transcribe_audio(self, file_path: str | Path) -> str:
|
||||
"""Transcribe an audio file via Whisper (OpenAI or Groq). Returns empty string on failure."""
|
||||
if not self.transcription_api_key:
|
||||
return ""
|
||||
try:
|
||||
if self.transcription_provider == "openai":
|
||||
from nanobot.providers.transcription import OpenAITranscriptionProvider
|
||||
provider = OpenAITranscriptionProvider(
|
||||
api_key=self.transcription_api_key,
|
||||
api_base=self.transcription_api_base or None,
|
||||
language=self.transcription_language or None,
|
||||
)
|
||||
else:
|
||||
from nanobot.providers.transcription import GroqTranscriptionProvider
|
||||
provider = GroqTranscriptionProvider(
|
||||
api_key=self.transcription_api_key,
|
||||
api_base=self.transcription_api_base or None,
|
||||
language=self.transcription_language or None,
|
||||
)
|
||||
return await provider.transcribe(file_path)
|
||||
from nanobot.audio.transcription import (
|
||||
resolve_transcription_config,
|
||||
transcribe_audio_file,
|
||||
)
|
||||
from nanobot.config.loader import load_config
|
||||
|
||||
return await transcribe_audio_file(file_path, resolve_transcription_config(load_config()))
|
||||
except Exception:
|
||||
self.logger.exception("Audio transcription failed")
|
||||
return ""
|
||||
|
||||
@ -80,11 +80,6 @@ class ChannelManager:
|
||||
"""Initialize channels discovered via pkgutil scan + entry_points plugins."""
|
||||
from nanobot.channels.registry import discover_channel_names, discover_enabled
|
||||
|
||||
transcription_provider = self.config.channels.transcription_provider
|
||||
transcription_key = self._resolve_transcription_key(transcription_provider)
|
||||
transcription_base = self._resolve_transcription_base(transcription_provider)
|
||||
transcription_language = self.config.channels.transcription_language
|
||||
|
||||
# Collect enabled module names first, then only import those.
|
||||
# Channel configs live in ChannelsConfig's extra fields (via
|
||||
# extra="allow"), so we enumerate candidates from pkgutil scan
|
||||
@ -135,10 +130,6 @@ class ChannelManager:
|
||||
)
|
||||
kwargs["gateway"] = gateway
|
||||
channel = cls(section, self.bus, **kwargs)
|
||||
channel.transcription_provider = transcription_provider
|
||||
channel.transcription_api_key = transcription_key
|
||||
channel.transcription_api_base = transcription_base
|
||||
channel.transcription_language = transcription_language
|
||||
channel.send_progress = self._resolve_bool_override(
|
||||
section, "send_progress", self.config.channels.send_progress,
|
||||
)
|
||||
@ -155,24 +146,6 @@ class ChannelManager:
|
||||
|
||||
self._validate_allow_from()
|
||||
|
||||
def _resolve_transcription_key(self, provider: str) -> str:
|
||||
"""Pick the API key for the configured transcription provider."""
|
||||
try:
|
||||
if provider == "openai":
|
||||
return self.config.providers.openai.api_key
|
||||
return self.config.providers.groq.api_key
|
||||
except AttributeError:
|
||||
return ""
|
||||
|
||||
def _resolve_transcription_base(self, provider: str) -> str:
|
||||
"""Pick the API base URL for the configured transcription provider."""
|
||||
try:
|
||||
if provider == "openai":
|
||||
return self.config.providers.openai.api_base or ""
|
||||
return self.config.providers.groq.api_base or ""
|
||||
except AttributeError:
|
||||
return ""
|
||||
|
||||
def _validate_allow_from(self) -> None:
|
||||
for name, ch in self.channels.items():
|
||||
cfg = ch.config
|
||||
|
||||
@ -45,6 +45,7 @@ from nanobot.webui.http_utils import (
|
||||
query_first as _query_first,
|
||||
)
|
||||
from nanobot.webui.mcp_presets_api import normalize_mcp_preset_mentions
|
||||
from nanobot.webui.transcription_ws import webui_transcription_event
|
||||
from nanobot.webui.websocket_logging import websockets_server_logger
|
||||
|
||||
|
||||
@ -235,7 +236,7 @@ _VIDEO_MIME_ALLOWED: frozenset[str] = frozenset({
|
||||
|
||||
_UPLOAD_MIME_ALLOWED: frozenset[str] = _IMAGE_MIME_ALLOWED | _VIDEO_MIME_ALLOWED
|
||||
|
||||
_DATA_URL_MIME_RE = re.compile(r"^data:([^;]+);base64,", re.DOTALL)
|
||||
_DATA_URL_MIME_RE = re.compile(r"^data:([^;,]+)(?:;[^,]*)*;base64,", re.DOTALL)
|
||||
|
||||
|
||||
def _extract_data_url_mime(url: str) -> str | None:
|
||||
@ -419,7 +420,6 @@ class WebSocketChannel(BaseChannel):
|
||||
return None
|
||||
|
||||
# -- Server lifecycle and connection ingress ---------------------------
|
||||
# -- Server lifecycle and connection ingress ---------------------------
|
||||
|
||||
async def start(self) -> None:
|
||||
from nanobot.utils.logging_bridge import redirect_lib_logging
|
||||
@ -703,6 +703,10 @@ class WebSocketChannel(BaseChannel):
|
||||
workspace_scope=scope.payload(),
|
||||
)
|
||||
return
|
||||
if t == "transcribe_audio":
|
||||
event, payload = await webui_transcription_event(envelope)
|
||||
await self._send_event(connection, event, **payload)
|
||||
return
|
||||
if t == "message":
|
||||
cid = envelope.get("chat_id")
|
||||
content = envelope.get("content")
|
||||
|
||||
@ -39,8 +39,19 @@ class ChannelsConfig(Base):
|
||||
show_reasoning: bool = True # surface model reasoning when channel implements it
|
||||
extract_document_text: bool = True # extract text from document attachments before sending to the model
|
||||
send_max_retries: int = Field(default=3, ge=0, le=10) # Max delivery attempts (initial send included)
|
||||
transcription_provider: str = "groq" # Voice transcription backend: "groq" or "openai"
|
||||
transcription_language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$") # Optional ISO-639-1 hint for audio transcription
|
||||
transcription_provider: str = "groq" # Deprecated: use top-level transcription.provider
|
||||
transcription_language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$") # Deprecated: use top-level transcription.language
|
||||
|
||||
|
||||
class TranscriptionConfig(Base):
|
||||
"""Cross-channel audio transcription configuration."""
|
||||
|
||||
enabled: bool = True
|
||||
provider: Literal["groq", "openai"] | None = None
|
||||
model: str | None = None
|
||||
language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")
|
||||
max_duration_sec: int = Field(default=120, ge=1, le=600)
|
||||
max_upload_mb: int = Field(default=25, ge=1, le=100)
|
||||
|
||||
|
||||
class DreamConfig(Base):
|
||||
@ -167,7 +178,7 @@ class AgentsConfig(Base):
|
||||
class ProviderConfig(Base):
|
||||
"""LLM provider configuration."""
|
||||
|
||||
api_key: str | None = None
|
||||
api_key: str | None = Field(default=None, repr=False)
|
||||
api_base: str | None = None
|
||||
api_type: Literal["auto", "chat_completions", "responses"] = "auto" # Request API surface
|
||||
extra_headers: dict[str, str] | None = None # Custom headers (e.g. APP-Code for AiHubMix)
|
||||
@ -312,6 +323,7 @@ class Config(BaseSettings):
|
||||
|
||||
agents: AgentsConfig = Field(default_factory=AgentsConfig)
|
||||
channels: ChannelsConfig = Field(default_factory=ChannelsConfig)
|
||||
transcription: TranscriptionConfig = Field(default_factory=TranscriptionConfig)
|
||||
providers: ProvidersConfig = Field(default_factory=ProvidersConfig)
|
||||
api: ApiConfig = Field(default_factory=ApiConfig)
|
||||
gateway: GatewayConfig = Field(default_factory=GatewayConfig)
|
||||
|
||||
@ -1,6 +1,12 @@
|
||||
"""Voice transcription providers (Groq and OpenAI Whisper)."""
|
||||
"""Provider-specific voice transcription adapters.
|
||||
|
||||
This module only knows how to call external transcription APIs such as Groq
|
||||
and OpenAI Whisper. Product-level config fallback, WebUI upload validation,
|
||||
and channel integration live in ``nanobot.audio.transcription``.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import mimetypes
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@ -8,6 +14,15 @@ import httpx
|
||||
from loguru import logger
|
||||
|
||||
_TRANSCRIPTIONS_PATH = "audio/transcriptions"
|
||||
_AUDIO_MIME_OVERRIDES = {
|
||||
".m4a": "audio/mp4",
|
||||
".mpga": "audio/mpeg",
|
||||
".ogg": "audio/ogg",
|
||||
".opus": "audio/ogg",
|
||||
".wav": "audio/wav",
|
||||
".weba": "audio/webm",
|
||||
".webm": "audio/webm",
|
||||
}
|
||||
|
||||
|
||||
def _resolve_transcription_url(api_base: str | None, default_url: str) -> str:
|
||||
@ -26,6 +41,14 @@ def _resolve_transcription_url(api_base: str | None, default_url: str) -> str:
|
||||
return f"{base}/{_TRANSCRIPTIONS_PATH}"
|
||||
|
||||
|
||||
def _audio_mime_type(path: Path) -> str:
    """Pick the MIME type to send for *path*.

    The lower-cased file suffix is looked up in ``_AUDIO_MIME_OVERRIDES``
    first, then ``mimetypes`` is asked to guess from the file name, and
    ``application/octet-stream`` is the last resort.
    """
    override = _AUDIO_MIME_OVERRIDES.get(path.suffix.lower())
    if override:
        return override
    guessed, _encoding = mimetypes.guess_type(path.name)
    return guessed or "application/octet-stream"
|
||||
|
||||
|
||||
# Up to 3 retries (4 attempts total) with exponential backoff on transient
|
||||
# failures. Whisper endpoints occasionally return 502/503 under load, and
|
||||
# mobile-network transcription callers hit sporadic connect/read errors.
|
||||
@ -71,7 +94,7 @@ async def _post_transcription_with_retry(
|
||||
async with httpx.AsyncClient() as client:
|
||||
for attempt in range(_MAX_RETRIES + 1):
|
||||
files = {
|
||||
"file": (path.name, data),
|
||||
"file": (path.name, data, _audio_mime_type(path)),
|
||||
"model": (None, model),
|
||||
}
|
||||
if language:
|
||||
@ -113,6 +136,16 @@ async def _post_transcription_with_retry(
|
||||
|
||||
try:
|
||||
response.raise_for_status()
|
||||
except httpx.HTTPStatusError:
|
||||
body = response.text.strip().replace("\n", " ")[:500]
|
||||
logger.error(
|
||||
"{} transcription HTTP {}{}{}",
|
||||
provider_label,
|
||||
response.status_code,
|
||||
f" {response.reason_phrase}" if response.reason_phrase else "",
|
||||
f": {body}" if body else "",
|
||||
)
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.exception("{} transcription error: {}", provider_label, e)
|
||||
return ""
|
||||
@ -144,6 +177,7 @@ class OpenAITranscriptionProvider:
|
||||
api_key: str | None = None,
|
||||
api_base: str | None = None,
|
||||
language: str | None = None,
|
||||
model: str | None = None,
|
||||
):
|
||||
self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
|
||||
self.api_url = _resolve_transcription_url(
|
||||
@ -151,6 +185,7 @@ class OpenAITranscriptionProvider:
|
||||
"https://api.openai.com/v1/audio/transcriptions",
|
||||
)
|
||||
self.language = language or None
|
||||
self.model = model or "whisper-1"
|
||||
logger.debug("OpenAI transcription endpoint: {}", self.api_url)
|
||||
|
||||
async def transcribe(self, file_path: str | Path) -> str:
|
||||
@ -165,7 +200,7 @@ class OpenAITranscriptionProvider:
|
||||
self.api_url,
|
||||
api_key=self.api_key,
|
||||
path=path,
|
||||
model="whisper-1",
|
||||
model=self.model,
|
||||
provider_label="OpenAI",
|
||||
language=self.language,
|
||||
)
|
||||
@ -183,6 +218,7 @@ class GroqTranscriptionProvider:
|
||||
api_key: str | None = None,
|
||||
api_base: str | None = None,
|
||||
language: str | None = None,
|
||||
model: str | None = None,
|
||||
):
|
||||
self.api_key = api_key or os.environ.get("GROQ_API_KEY")
|
||||
self.api_url = _resolve_transcription_url(
|
||||
@ -190,6 +226,7 @@ class GroqTranscriptionProvider:
|
||||
"https://api.groq.com/openai/v1/audio/transcriptions",
|
||||
)
|
||||
self.language = language or None
|
||||
self.model = model or "whisper-large-v3"
|
||||
logger.debug("Groq transcription endpoint: {}", self.api_url)
|
||||
|
||||
async def transcribe(self, file_path: str | Path) -> str:
|
||||
@ -215,7 +252,7 @@ class GroqTranscriptionProvider:
|
||||
self.api_url,
|
||||
api_key=self.api_key,
|
||||
path=path,
|
||||
model="whisper-large-v3",
|
||||
model=self.model,
|
||||
provider_label="Groq",
|
||||
language=self.language,
|
||||
)
|
||||
|
||||
@ -18,13 +18,30 @@ from nanobot.utils.helpers import safe_filename
|
||||
DEFAULT_MAX_BYTES = 10 * 1024 * 1024
|
||||
MAX_FILE_SIZE = DEFAULT_MAX_BYTES
|
||||
|
||||
_DATA_URL_RE = re.compile(r"^data:([^;]+);base64,(.+)$", re.DOTALL)
|
||||
_DATA_URL_RE = re.compile(r"^data:([^;,]+)(?:;[^,]*)*;base64,(.+)$", re.DOTALL)
|
||||
# MIME type -> file extension table that ``save_base64_data_url`` consults
# before falling back to ``mimetypes.guess_extension``.
_MIME_EXTENSION_OVERRIDES = {
    # Python's ``mimetypes`` maps browser-recorded audio/webm to ``.weba`` and
    # audio/ogg to ``.oga`` on macOS. Some transcription APIs validate by the
    # file extension and accept the canonical container extensions instead.
    "application/ogg": ".ogg",
    "audio/ogg": ".ogg",
    "audio/mpga": ".mpga",
    "audio/wav": ".wav",
    "audio/webm": ".webm",
    "audio/x-m4a": ".m4a",
    "audio/x-wav": ".wav",
    "audio/vnd.wave": ".wav",
    "video/webm": ".webm",
}
|
||||
|
||||
|
||||
class FileSizeExceeded(Exception):
|
||||
class FileSizeExceededError(Exception):
|
||||
"""Raised when a decoded payload exceeds the caller's size limit."""
|
||||
|
||||
|
||||
FileSizeExceeded = FileSizeExceededError
|
||||
|
||||
|
||||
def save_base64_data_url(
|
||||
data_url: str,
|
||||
media_dir: Path,
|
||||
@ -40,7 +57,7 @@ def save_base64_data_url(
|
||||
m = _DATA_URL_RE.match(data_url)
|
||||
if not m:
|
||||
return None
|
||||
mime_type, b64_payload = m.group(1), m.group(2)
|
||||
mime_type, b64_payload = m.group(1).strip().lower(), m.group(2)
|
||||
try:
|
||||
raw = base64.b64decode(b64_payload)
|
||||
except Exception:
|
||||
@ -48,7 +65,7 @@ def save_base64_data_url(
|
||||
limit = DEFAULT_MAX_BYTES if max_bytes is None else max_bytes
|
||||
if len(raw) > limit:
|
||||
raise FileSizeExceeded(f"File exceeds {limit // (1024 * 1024)}MB limit")
|
||||
ext = mimetypes.guess_extension(mime_type) or ".bin"
|
||||
ext = _MIME_EXTENSION_OVERRIDES.get(mime_type) or mimetypes.guess_extension(mime_type) or ".bin"
|
||||
filename = f"{uuid.uuid4().hex[:12]}{ext}"
|
||||
dest = media_dir / safe_filename(filename)
|
||||
dest.write_bytes(raw)
|
||||
|
||||
@ -15,6 +15,7 @@ from zoneinfo import ZoneInfo
|
||||
|
||||
import httpx
|
||||
|
||||
from nanobot.audio.transcription import resolve_transcription_config
|
||||
from nanobot.config.loader import get_config_path, load_config, save_config
|
||||
from nanobot.config.schema import ModelPresetConfig
|
||||
from nanobot.providers.image_generation import (
|
||||
@ -90,6 +91,7 @@ _IMAGE_GENERATION_ASPECT_RATIOS = {
|
||||
"2:3",
|
||||
"21:9",
|
||||
}
|
||||
_TRANSCRIPTION_PROVIDERS = ("groq", "openai")
|
||||
_CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144}
|
||||
_MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+")
|
||||
_ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
|
||||
@ -576,6 +578,22 @@ def _image_generation_provider_rows(config: Any) -> list[dict[str, Any]]:
|
||||
return rows
|
||||
|
||||
|
||||
def _transcription_provider_rows(config: Any) -> list[dict[str, Any]]:
    """Describe every provider listed in ``_TRANSCRIPTION_PROVIDERS``.

    Each row holds the provider name, a display label (the registry label when
    a spec is found, otherwise the name), whether an API key is set, a masked
    hint of that key, the configured API base and the registry default base.
    """

    def _describe(name: str) -> dict[str, Any]:
        spec = find_by_name(name)
        provider_config = getattr(config.providers, name, None)
        api_key = getattr(provider_config, "api_key", None)
        # Empty or missing defaults are reported as None.
        default_base = (spec.default_api_base or None) if spec else None
        return {
            "name": name,
            "label": name if spec is None else spec.label,
            "configured": bool(api_key),
            "api_key_hint": _mask_secret_hint(api_key),
            "api_base": getattr(provider_config, "api_base", None),
            "default_api_base": default_base,
        }

    return [_describe(name) for name in _TRANSCRIPTION_PROVIDERS]
|
||||
|
||||
|
||||
def settings_payload(
|
||||
*,
|
||||
requires_restart: bool = False,
|
||||
@ -633,6 +651,7 @@ def settings_payload(
|
||||
|
||||
search_config = config.tools.web.search
|
||||
image_config = config.tools.image_generation
|
||||
transcription = resolve_transcription_config(config)
|
||||
search_provider = (
|
||||
search_config.provider
|
||||
if search_config.provider in _WEB_SEARCH_PROVIDER_BY_NAME
|
||||
@ -733,6 +752,16 @@ def settings_payload(
|
||||
"save_dir": image_config.save_dir,
|
||||
"providers": image_providers,
|
||||
},
|
||||
"transcription": {
|
||||
"enabled": transcription.enabled,
|
||||
"provider": transcription.provider,
|
||||
"provider_configured": transcription.configured,
|
||||
"model": transcription.model,
|
||||
"language": transcription.language,
|
||||
"max_duration_sec": transcription.max_duration_sec,
|
||||
"max_upload_mb": transcription.max_upload_mb,
|
||||
"providers": _transcription_provider_rows(config),
|
||||
},
|
||||
"runtime": {
|
||||
"config_path": str(get_config_path().expanduser()),
|
||||
"workspace_path": str(config.workspace_path),
|
||||
@ -1311,3 +1340,71 @@ def update_image_generation_settings(query: QueryParams) -> dict[str, Any]:
|
||||
if changed:
|
||||
save_config(config)
|
||||
return settings_payload(requires_restart=changed)
|
||||
|
||||
|
||||
def update_transcription_settings(query: QueryParams) -> dict[str, Any]:
    """Validate transcription settings from ``query`` and persist any changes.

    Recognised keys are ``enabled``, ``provider``, ``model``, ``language``,
    ``max_duration_sec`` (alias ``maxDurationSec``) and ``max_upload_mb``
    (alias ``maxUploadMb``). A missing key leaves the stored value alone; a
    blank ``model`` or ``language`` clears the field. The config is saved only
    when at least one value actually changed.

    Raises:
        WebUISettingsError: when a supplied value fails validation.

    Returns:
        The result of ``settings_payload()`` after the optional save.
    """
    config = load_config()
    section = config.transcription
    dirty = False

    def store(field: str, value: Any) -> bool:
        """Assign ``value`` to ``section.<field>`` if different; report whether it changed."""
        if getattr(section, field) != value:
            setattr(section, field, value)
            return True
        return False

    raw_enabled = _query_first(query, "enabled")
    if raw_enabled is not None:
        dirty |= store("enabled", _parse_bool(raw_enabled, "enabled"))

    raw_provider = _query_first(query, "provider")
    if raw_provider is not None:
        provider_name = raw_provider.strip().lower()
        if provider_name not in _TRANSCRIPTION_PROVIDERS:
            raise WebUISettingsError("unknown transcription provider")
        dirty |= store("provider", provider_name)

    raw_model = _query_first(query, "model")
    if raw_model is not None:
        # Whitespace-only input clears the model.
        model_name = raw_model.strip() or None
        if model_name is not None and len(model_name) > 200:
            raise WebUISettingsError("transcription model is too long")
        dirty |= store("model", model_name)

    raw_language = _query_first(query, "language")
    if raw_language is not None:
        # Whitespace-only input clears the language.
        language_code = raw_language.strip().lower() or None
        if language_code is not None and not re.fullmatch(r"[a-z]{2,3}", language_code):
            raise WebUISettingsError("transcription language must be 2-3 lowercase letters")
        dirty |= store("language", language_code)

    raw_duration = _query_first_alias(query, "max_duration_sec", "maxDurationSec")
    if raw_duration is not None:
        try:
            seconds = int(raw_duration)
        except ValueError:
            raise WebUISettingsError("max_duration_sec must be an integer") from None
        if not 1 <= seconds <= 600:
            raise WebUISettingsError("max_duration_sec must be between 1 and 600")
        dirty |= store("max_duration_sec", seconds)

    raw_upload = _query_first_alias(query, "max_upload_mb", "maxUploadMb")
    if raw_upload is not None:
        try:
            megabytes = int(raw_upload)
        except ValueError:
            raise WebUISettingsError("max_upload_mb must be an integer") from None
        if not 1 <= megabytes <= 100:
            raise WebUISettingsError("max_upload_mb must be between 1 and 100")
        dirty |= store("max_upload_mb", megabytes)

    if dirty:
        save_config(config)
    return settings_payload()
|
||||
|
||||
@ -33,6 +33,7 @@ from nanobot.webui.settings_api import (
|
||||
update_model_configuration,
|
||||
update_network_safety_settings,
|
||||
update_provider_settings,
|
||||
update_transcription_settings,
|
||||
update_web_search_settings,
|
||||
)
|
||||
|
||||
@ -100,6 +101,8 @@ class WebUISettingsRouter:
|
||||
return self._handle_settings_web_search_update(request)
|
||||
if path == "/api/settings/image-generation/update":
|
||||
return self._handle_settings_image_generation_update(request)
|
||||
if path == "/api/settings/transcription/update":
|
||||
return self._handle_settings_transcription_update(request)
|
||||
if path == "/api/settings/network-safety/update":
|
||||
return self._handle_settings_network_safety_update(request)
|
||||
if path == "/api/settings/cli-apps":
|
||||
@ -275,6 +278,15 @@ class WebUISettingsRouter:
|
||||
return self._error_response(e.status, e.message)
|
||||
return self._json_response(self._with_restart_state(payload, section="image"))
|
||||
|
||||
def _handle_settings_transcription_update(self, request: WsRequest) -> Response:
    """Serve ``/api/settings/transcription/update``.

    Unauthorized requests get the unauthorized response. Otherwise the
    request's query parameters go to ``update_transcription_settings``; a
    ``WebUISettingsError`` becomes an error response with its status and
    message, and success returns the payload as JSON after
    ``_with_restart_state`` has processed it.
    """
    if self._authorized(request):
        try:
            updated = update_transcription_settings(self._query(request))
        except WebUISettingsError as exc:
            return self._error_response(exc.status, exc.message)
        return self._json_response(self._with_restart_state(updated))
    return self._unauthorized()
|
||||
|
||||
def _handle_settings_network_safety_update(self, request: WsRequest) -> Response:
|
||||
if not self._authorized(request):
|
||||
return self._unauthorized()
|
||||
|
||||
46
nanobot/webui/transcription_ws.py
Normal file
46
nanobot/webui/transcription_ws.py
Normal file
@ -0,0 +1,46 @@
|
||||
"""WebUI transcription envelope handling.
|
||||
|
||||
The WebSocket channel owns transport and subscription fan-out. This module owns
|
||||
the WebUI-specific audio transcription action carried over that socket.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from nanobot.audio.transcription import (
|
||||
TranscriptionIngressError,
|
||||
resolve_transcription_config,
|
||||
transcribe_audio_data_url,
|
||||
)
|
||||
from nanobot.config.loader import load_config
|
||||
|
||||
_MAX_REQUEST_ID_LENGTH = 80
|
||||
|
||||
|
||||
async def webui_transcription_event(envelope: dict[str, Any]) -> tuple[str, dict[str, Any]]:
    """Process one WebUI transcription envelope and return ``(event, payload)``.

    The envelope must carry a non-empty string ``request_id`` of at most
    ``_MAX_REQUEST_ID_LENGTH`` characters; otherwise the result is a
    ``transcription_error`` with detail ``invalid_request`` and no request id.
    A valid request passes ``data_url`` and ``duration_ms`` to
    ``transcribe_audio_data_url`` together with the transcription config
    resolved from a freshly loaded config. A ``TranscriptionIngressError``
    becomes a ``transcription_error`` carrying its detail, its extra fields and
    the request id; success yields ``transcription_result`` with the text.
    """
    request_id = envelope.get("request_id")
    id_ok = isinstance(request_id, str) and 0 < len(request_id) <= _MAX_REQUEST_ID_LENGTH

    def _failure(detail: str, **extra: Any) -> tuple[str, dict[str, Any]]:
        # Echo the request id only when it passed validation.
        body: dict[str, Any] = {"detail": detail, **extra}
        if id_ok:
            body["request_id"] = request_id
        return "transcription_error", body

    if not id_ok:
        return _failure("invalid_request")

    try:
        transcript = await transcribe_audio_data_url(
            envelope.get("data_url"),
            resolve_transcription_config(load_config()),
            duration_ms=envelope.get("duration_ms"),
        )
    except TranscriptionIngressError as exc:
        return _failure(exc.detail, **exc.extra)
    return "transcription_result", {"request_id": request_id, "text": transcript}
|
||||
@ -12,7 +12,8 @@ from nanobot.bus.events import OutboundMessage
|
||||
from nanobot.bus.queue import MessageBus
|
||||
from nanobot.channels.base import BaseChannel
|
||||
from nanobot.channels.manager import ChannelManager
|
||||
from nanobot.config.schema import ChannelsConfig
|
||||
from nanobot.config.loader import save_config
|
||||
from nanobot.config.schema import ChannelsConfig, Config
|
||||
from nanobot.providers.transcription import GroqTranscriptionProvider as _GroqProvider
|
||||
from nanobot.providers.transcription import OpenAITranscriptionProvider as _OpenAIProvider
|
||||
from nanobot.utils.restart import RestartNotice
|
||||
@ -238,102 +239,103 @@ async def test_manager_loads_plugin_from_dict_config():
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_manager_propagates_groq_transcription_api_base_to_channels():
|
||||
from nanobot.channels.manager import ChannelManager
|
||||
|
||||
fake_config = SimpleNamespace(
|
||||
channels=ChannelsConfig.model_validate({
|
||||
"fakeplugin": {"enabled": True, "allowFrom": ["*"]},
|
||||
"transcriptionLanguage": "en",
|
||||
}),
|
||||
providers=SimpleNamespace(
|
||||
groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"),
|
||||
openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"),
|
||||
),
|
||||
)
|
||||
|
||||
with patch(
|
||||
"nanobot.channels.registry.discover_enabled",
|
||||
return_value={"fakeplugin": _FakePlugin},
|
||||
):
|
||||
mgr = ChannelManager.__new__(ChannelManager)
|
||||
mgr.config = fake_config
|
||||
mgr.bus = MessageBus()
|
||||
mgr.channels = {}
|
||||
mgr._dispatch_task = None
|
||||
mgr._init_channels()
|
||||
|
||||
channel = mgr.channels["fakeplugin"]
|
||||
assert channel.transcription_provider == "groq"
|
||||
assert channel.transcription_api_key == "groq-key"
|
||||
assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
|
||||
assert channel.transcription_language == "en"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_manager_propagates_openai_transcription_api_base_to_channels():
|
||||
from nanobot.channels.manager import ChannelManager
|
||||
|
||||
fake_config = SimpleNamespace(
|
||||
channels=ChannelsConfig.model_validate({
|
||||
"fakeplugin": {"enabled": True, "allowFrom": ["*"]},
|
||||
"transcriptionProvider": "openai",
|
||||
}),
|
||||
providers=SimpleNamespace(
|
||||
openai=SimpleNamespace(
|
||||
api_key="openai-key",
|
||||
api_base="http://proxy.local/v1/audio/transcriptions",
|
||||
),
|
||||
groq=SimpleNamespace(api_key="groq-key", api_base=""),
|
||||
),
|
||||
)
|
||||
|
||||
with patch(
|
||||
"nanobot.channels.registry.discover_enabled",
|
||||
return_value={"fakeplugin": _FakePlugin},
|
||||
):
|
||||
mgr = ChannelManager.__new__(ChannelManager)
|
||||
mgr.config = fake_config
|
||||
mgr.bus = MessageBus()
|
||||
mgr.channels = {}
|
||||
mgr._dispatch_task = None
|
||||
mgr._init_channels()
|
||||
|
||||
channel = mgr.channels["fakeplugin"]
|
||||
assert channel.transcription_provider == "openai"
|
||||
assert channel.transcription_api_key == "openai-key"
|
||||
assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_base_channel_passes_api_base_to_openai_transcription_provider():
|
||||
"""BaseChannel.transcribe_audio must forward transcription_api_base to OpenAI."""
|
||||
async def test_base_channel_reads_current_transcription_config_each_call(
|
||||
tmp_path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""BaseChannel.transcribe_audio resolves config at call time, not manager init time."""
|
||||
from nanobot.providers import transcription as transcription_mod
|
||||
|
||||
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
|
||||
channel.transcription_provider = "openai"
|
||||
channel.transcription_api_key = "k"
|
||||
channel.transcription_api_base = "http://override/v1/audio/transcriptions"
|
||||
channel.transcription_language = "en"
|
||||
config_path = tmp_path / "config.json"
|
||||
config = Config()
|
||||
config.transcription.provider = "openai"
|
||||
config.transcription.model = "whisper-custom"
|
||||
config.transcription.language = "en"
|
||||
config.providers.openai.api_key = "openai-key"
|
||||
config.providers.openai.api_base = "http://openai.local/v1/audio/transcriptions"
|
||||
save_config(config, config_path)
|
||||
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||
|
||||
captured: dict[str, object] = {}
|
||||
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
|
||||
|
||||
calls: list[dict[str, object]] = []
|
||||
|
||||
class _StubOpenAI:
|
||||
def __init__(self, api_key=None, api_base=None, language=None):
|
||||
captured["api_key"] = api_key
|
||||
captured["api_base"] = api_base
|
||||
captured["language"] = language
|
||||
def __init__(self, api_key=None, api_base=None, language=None, model=None):
|
||||
calls.append({
|
||||
"provider": "openai",
|
||||
"api_key": api_key,
|
||||
"api_base": api_base,
|
||||
"language": language,
|
||||
"model": model,
|
||||
})
|
||||
|
||||
async def transcribe(self, file_path):
|
||||
return "ok"
|
||||
return "openai-ok"
|
||||
|
||||
with patch.object(transcription_mod, "OpenAITranscriptionProvider", _StubOpenAI):
|
||||
result = await channel.transcribe_audio("/tmp/does-not-matter.wav")
|
||||
class _StubGroq:
|
||||
def __init__(self, api_key=None, api_base=None, language=None, model=None):
|
||||
calls.append({
|
||||
"provider": "groq",
|
||||
"api_key": api_key,
|
||||
"api_base": api_base,
|
||||
"language": language,
|
||||
"model": model,
|
||||
})
|
||||
|
||||
assert result == "ok"
|
||||
assert captured["api_key"] == "k"
|
||||
assert captured["api_base"] == "http://override/v1/audio/transcriptions"
|
||||
assert captured["language"] == "en"
|
||||
async def transcribe(self, file_path):
|
||||
return "groq-ok"
|
||||
|
||||
with (
|
||||
patch.object(transcription_mod, "OpenAITranscriptionProvider", _StubOpenAI),
|
||||
patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq),
|
||||
):
|
||||
assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "openai-ok"
|
||||
|
||||
config.transcription.provider = "groq"
|
||||
config.transcription.model = "whisper-large-v3-turbo"
|
||||
config.transcription.language = "ko"
|
||||
config.providers.groq.api_key = "groq-key"
|
||||
config.providers.groq.api_base = "http://groq.local/v1/audio/transcriptions"
|
||||
save_config(config, config_path)
|
||||
|
||||
assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "groq-ok"
|
||||
|
||||
assert calls == [
|
||||
{
|
||||
"provider": "openai",
|
||||
"api_key": "openai-key",
|
||||
"api_base": "http://openai.local/v1/audio/transcriptions",
|
||||
"language": "en",
|
||||
"model": "whisper-custom",
|
||||
},
|
||||
{
|
||||
"provider": "groq",
|
||||
"api_key": "groq-key",
|
||||
"api_base": "http://groq.local/v1/audio/transcriptions",
|
||||
"language": "ko",
|
||||
"model": "whisper-large-v3-turbo",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_base_channel_respects_disabled_transcription_config(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
):
    """With transcription disabled, the channel returns "" and builds no provider."""
    disabled = Config()
    disabled.transcription.enabled = False
    disabled.providers.groq.api_key = "groq-key"
    cfg_file = tmp_path / "config.json"
    save_config(disabled, cfg_file)
    monkeypatch.setattr("nanobot.config.loader._current_config_path", cfg_file)

    plugin = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())

    with patch("nanobot.providers.transcription.GroqTranscriptionProvider") as groq_cls:
        transcript = await plugin.transcribe_audio("/tmp/does-not-matter.wav")
        assert transcript == ""
        groq_cls.assert_not_called()
|
||||
|
||||
|
||||
def test_openai_transcription_provider_honors_api_base_argument():
|
||||
@ -348,37 +350,6 @@ def test_openai_transcription_provider_honors_api_base_argument():
|
||||
assert custom.api_url == "http://override/v1/audio/transcriptions"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_base_channel_passes_language_to_groq_transcription_provider():
|
||||
"""BaseChannel.transcribe_audio must forward transcription_language to Groq."""
|
||||
from nanobot.providers import transcription as transcription_mod
|
||||
|
||||
channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
|
||||
channel.transcription_provider = "groq"
|
||||
channel.transcription_api_key = "k"
|
||||
channel.transcription_api_base = "http://override/v1/audio/transcriptions"
|
||||
channel.transcription_language = "ko"
|
||||
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
class _StubGroq:
|
||||
def __init__(self, api_key=None, api_base=None, language=None):
|
||||
captured["api_key"] = api_key
|
||||
captured["api_base"] = api_base
|
||||
captured["language"] = language
|
||||
|
||||
async def transcribe(self, file_path):
|
||||
return "ok"
|
||||
|
||||
with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq):
|
||||
result = await channel.transcribe_audio("/tmp/does-not-matter.wav")
|
||||
|
||||
assert result == "ok"
|
||||
assert captured["api_key"] == "k"
|
||||
assert captured["api_base"] == "http://override/v1/audio/transcriptions"
|
||||
assert captured["language"] == "ko"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Transcription provider HTTP tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@ -69,6 +69,7 @@ def _make_channel() -> WebSocketChannel:
|
||||
[
|
||||
("data:image/png;base64,AAAA", "image/png"),
|
||||
("data:image/jpeg;base64,AAAA", "image/jpeg"),
|
||||
("data:audio/webm;codecs=opus;base64,AAAA", "audio/webm"),
|
||||
("data:IMAGE/PNG;base64,AAAA", "image/png"),
|
||||
("data:image/svg+xml;base64,AAAA", "image/svg+xml"),
|
||||
("data:text/plain;base64,AAAA", "text/plain"),
|
||||
|
||||
@ -271,8 +271,6 @@ async def test_lid_to_phone_cache_resolves_lid_only_messages():
|
||||
async def test_voice_message_transcription_uses_media_path():
|
||||
"""Voice messages are transcribed when media path is available."""
|
||||
ch = WhatsAppChannel({"enabled": True, "allowFrom": ["*"]}, MagicMock())
|
||||
ch.transcription_provider = "openai"
|
||||
ch.transcription_api_key = "sk-test"
|
||||
ch._handle_message = AsyncMock()
|
||||
ch.transcribe_audio = AsyncMock(return_value="Hello world")
|
||||
|
||||
|
||||
@ -8,6 +8,8 @@ from unittest.mock import AsyncMock, patch
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from nanobot.audio.transcription import resolve_transcription_config
|
||||
from nanobot.config.schema import Config
|
||||
from nanobot.providers.transcription import (
|
||||
GroqTranscriptionProvider,
|
||||
OpenAITranscriptionProvider,
|
||||
@ -33,6 +35,65 @@ def _raw_response(status: int, content: bytes) -> httpx.Response:
|
||||
return httpx.Response(status_code=status, content=content, request=request)
|
||||
|
||||
|
||||
def test_resolver_uses_legacy_channel_provider_when_top_level_is_unset() -> None:
    """Legacy ``channels.transcription_*`` values are used when top-level ones are unset."""
    cfg = Config()
    cfg.channels.transcription_provider = "openai"
    cfg.channels.transcription_language = "en"
    cfg.providers.openai.api_key = "sk-test"
    cfg.providers.openai.api_base = "https://proxy.example/v1"

    effective = resolve_transcription_config(cfg)

    expected = {
        "provider": "openai",
        "model": "whisper-1",
        "language": "en",
        "api_key": "sk-test",
        "api_base": "https://proxy.example/v1",
    }
    for attr, value in expected.items():
        assert getattr(effective, attr) == value
    assert effective.configured is True
|
||||
|
||||
|
||||
def test_resolver_prefers_top_level_transcription_over_legacy_channels() -> None:
    """Top-level ``transcription`` settings take precedence over legacy channel fields."""
    cfg = Config()
    # Legacy values that the top-level section should override.
    cfg.channels.transcription_provider = "openai"
    cfg.channels.transcription_language = "en"
    cfg.transcription.provider = "groq"
    cfg.transcription.model = "whisper-large-v3-turbo"
    cfg.transcription.language = "ko"
    cfg.providers.groq.api_key = "gsk-test"
    cfg.providers.groq.api_base = "https://groq.example/openai/v1"

    effective = resolve_transcription_config(cfg)

    expected = {
        "provider": "groq",
        "model": "whisper-large-v3-turbo",
        "language": "ko",
        "api_key": "gsk-test",
        "api_base": "https://groq.example/openai/v1",
    }
    for attr, value in expected.items():
        assert getattr(effective, attr) == value
|
||||
|
||||
|
||||
def test_resolved_transcription_repr_hides_api_key() -> None:
    """``repr`` of the resolved config exposes neither the key value nor its field name."""
    cfg = Config()
    cfg.providers.groq.api_key = "gsk-secret"

    effective = resolve_transcription_config(cfg)

    for leaked in ("gsk-secret", "api_key"):
        assert leaked not in repr(effective)
|
||||
|
||||
|
||||
def test_resolver_keeps_enabled_and_limits_on_effective_config() -> None:
    """The resolved config carries over ``enabled`` and both size/duration limits."""
    cfg = Config()
    section = cfg.transcription
    section.enabled = False
    section.max_duration_sec = 45
    section.max_upload_mb = 12

    effective = resolve_transcription_config(cfg)

    assert effective.enabled is False
    assert (effective.max_duration_sec, effective.max_upload_mb) == (45, 12)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OpenAI provider — retry on transient HTTP + network errors
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -215,6 +276,32 @@ async def test_provider_omits_language_when_unset(
|
||||
assert "language" not in files
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_provider_forwards_custom_model_in_multipart(audio_file: Path) -> None:
    """A custom ``model`` argument is sent in the multipart ``model`` field."""
    groq = GroqTranscriptionProvider(api_key="k", model="whisper-large-v3-turbo")
    fake_post = AsyncMock(return_value=_response(200, {"text": "ok"}))
    with patch("httpx.AsyncClient.post", fake_post), patch("asyncio.sleep", AsyncMock()):
        transcript = await groq.transcribe(audio_file)

    assert transcript == "ok"
    first_call = fake_post.await_args_list[0]
    assert first_call.kwargs["files"]["model"] == (None, "whisper-large-v3-turbo")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_provider_forwards_file_mime_type(tmp_path: Path) -> None:
    """The multipart ``file`` tuple includes the audio MIME type for a ``.webm`` file."""
    recording = tmp_path / "voice.webm"
    recording.write_bytes(b"audio")
    groq = GroqTranscriptionProvider(api_key="k")
    fake_post = AsyncMock(return_value=_response(200, {"text": "ok"}))
    with patch("httpx.AsyncClient.post", fake_post), patch("asyncio.sleep", AsyncMock()):
        transcript = await groq.transcribe(recording)

    assert transcript == "ok"
    first_call = fake_post.await_args_list[0]
    assert first_call.kwargs["files"]["file"] == ("voice.webm", b"audio", "audio/webm")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_language_survives_retry(audio_file: Path) -> None:
|
||||
"""Regression: language must be present on every retry attempt, not just the first."""
|
||||
|
||||
@ -6,8 +6,12 @@ import shlex
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from nanobot.agent.tools.exec_session import (
|
||||
ExecSessionManager,
|
||||
ListExecSessionsTool,
|
||||
WriteStdinTool,
|
||||
)
|
||||
from nanobot.agent.tools.shell import ExecTool
|
||||
from nanobot.agent.tools.exec_session import ExecSessionManager, ListExecSessionsTool, WriteStdinTool
|
||||
|
||||
|
||||
def _python_command(code: str) -> str:
|
||||
@ -141,7 +145,7 @@ def test_exec_can_continue_with_stdin(tmp_path):
|
||||
return initial, result
|
||||
|
||||
initial, result = asyncio.run(run())
|
||||
assert "ready" in initial
|
||||
assert "ready" in initial + result
|
||||
assert "Process running" in initial
|
||||
assert "Elapsed:" in initial
|
||||
assert "got:ping" in result
|
||||
@ -170,7 +174,7 @@ def test_write_stdin_can_close_stdin(tmp_path):
|
||||
return initial, result
|
||||
|
||||
initial, result = asyncio.run(run())
|
||||
assert "ready" in initial
|
||||
assert "ready" in initial + result
|
||||
assert "got:payload" in result
|
||||
assert "Stdin closed." in result
|
||||
assert "Exit code: 0" in result
|
||||
@ -185,14 +189,20 @@ def test_write_stdin_can_terminate_session(tmp_path):
|
||||
"import time; print('ready', flush=True); time.sleep(30)"
|
||||
)
|
||||
|
||||
initial = await exec_tool.execute(command=command, yield_time_ms=500)
|
||||
initial = await exec_tool.execute(command=command, yield_time_ms=100)
|
||||
sid = _session_id(initial)
|
||||
waited = await stdin_tool.execute(
|
||||
session_id=sid,
|
||||
wait_for="ready",
|
||||
wait_timeout_ms=3000,
|
||||
yield_time_ms=0,
|
||||
)
|
||||
result = await stdin_tool.execute(
|
||||
session_id=sid,
|
||||
terminate=True,
|
||||
yield_time_ms=0,
|
||||
)
|
||||
return initial, result
|
||||
return initial + waited, result
|
||||
|
||||
initial, result = asyncio.run(run())
|
||||
assert "ready" in initial
|
||||
@ -243,7 +253,7 @@ def test_write_stdin_preserves_completed_session_output_until_polled(tmp_path):
|
||||
|
||||
initial, final = asyncio.run(run())
|
||||
|
||||
assert "ready" in initial
|
||||
assert "ready" in initial + final
|
||||
assert "done" in final
|
||||
assert "Exit code: 0" in final
|
||||
|
||||
|
||||
@ -8,8 +8,8 @@ import pytest
|
||||
|
||||
from nanobot.utils.media_decode import (
|
||||
DEFAULT_MAX_BYTES,
|
||||
FileSizeExceeded,
|
||||
MAX_FILE_SIZE,
|
||||
FileSizeExceeded,
|
||||
save_base64_data_url,
|
||||
)
|
||||
|
||||
@ -25,6 +25,31 @@ def test_saves_png_with_correct_extension(tmp_path) -> None:
|
||||
assert (tmp_path / result.split("/")[-1]).read_bytes() == b"fake png"
|
||||
|
||||
|
||||
def test_saves_data_url_with_mime_parameters(tmp_path) -> None:
    """A data URL whose MIME type has parameters is still decoded and saved as ``.webm``."""
    url = _data_url(b"voice", mime="audio/webm;codecs=opus")

    saved = save_base64_data_url(url, tmp_path)

    assert saved is not None
    assert saved.endswith(".webm")
    written = tmp_path / saved.split("/")[-1]
    assert written.read_bytes() == b"voice"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("mime", "suffix"),
    [
        ("audio/webm", ".webm"),
        ("video/webm", ".webm"),
        ("audio/ogg", ".ogg"),
        ("audio/wav", ".wav"),
        ("audio/mpga", ".mpga"),
    ],
)
def test_saves_common_audio_with_api_friendly_extension(
    tmp_path, mime: str, suffix: str
) -> None:
    """Each listed audio MIME type is saved under the expected file extension."""
    saved = save_base64_data_url(_data_url(b"voice", mime=mime), tmp_path)

    assert saved is not None
    assert saved.endswith(suffix)
|
||||
|
||||
|
||||
def test_returns_none_for_malformed_data_url(tmp_path) -> None:
|
||||
assert save_base64_data_url("not-a-data-url", tmp_path) is None
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ from nanobot.webui.settings_api import (
|
||||
update_agent_settings,
|
||||
update_model_configuration,
|
||||
update_network_safety_settings,
|
||||
update_transcription_settings,
|
||||
)
|
||||
|
||||
|
||||
@ -243,6 +244,75 @@ def test_settings_payload_includes_network_safety_fields(
|
||||
assert payload["advanced"]["ssrf_whitelist_count"] == 1
|
||||
|
||||
|
||||
def test_settings_payload_includes_effective_transcription_config(
|
||||
tmp_path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
config_path = tmp_path / "config.json"
|
||||
config = Config()
|
||||
config.channels.transcription_provider = "openai"
|
||||
config.channels.transcription_language = "en"
|
||||
config.providers.openai.api_key = "sk-test"
|
||||
save_config(config, config_path)
|
||||
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||
|
||||
payload = settings_payload()
|
||||
|
||||
assert payload["transcription"]["enabled"] is True
|
||||
assert payload["transcription"]["provider"] == "openai"
|
||||
assert payload["transcription"]["provider_configured"] is True
|
||||
assert payload["transcription"]["model"] == "whisper-1"
|
||||
assert payload["transcription"]["language"] == "en"
|
||||
|
||||
|
||||
def test_update_transcription_settings_writes_top_level_only(
|
||||
tmp_path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
config_path = tmp_path / "config.json"
|
||||
config = Config()
|
||||
config.channels.transcription_provider = "openai"
|
||||
config.channels.transcription_language = "en"
|
||||
config.providers.groq.api_key = "gsk-test"
|
||||
save_config(config, config_path)
|
||||
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||
|
||||
payload = update_transcription_settings(
|
||||
{
|
||||
"enabled": ["true"],
|
||||
"provider": ["groq"],
|
||||
"model": ["whisper-large-v3-turbo"],
|
||||
"language": ["ko"],
|
||||
"maxDurationSec": ["90"],
|
||||
"maxUploadMb": ["20"],
|
||||
}
|
||||
)
|
||||
|
||||
saved = load_config(config_path)
|
||||
assert saved.channels.transcription_provider == "openai"
|
||||
assert saved.channels.transcription_language == "en"
|
||||
assert saved.transcription.enabled is True
|
||||
assert saved.transcription.provider == "groq"
|
||||
assert saved.transcription.model == "whisper-large-v3-turbo"
|
||||
assert saved.transcription.language == "ko"
|
||||
assert saved.transcription.max_duration_sec == 90
|
||||
assert saved.transcription.max_upload_mb == 20
|
||||
assert payload["transcription"]["provider"] == "groq"
|
||||
assert payload["transcription"]["provider_configured"] is True
|
||||
|
||||
|
||||
def test_update_transcription_settings_validates_language(
|
||||
tmp_path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
config_path = tmp_path / "config.json"
|
||||
save_config(Config(), config_path)
|
||||
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||
|
||||
with pytest.raises(WebUISettingsError, match="transcription language"):
|
||||
update_transcription_settings({"language": ["en-US"]})
|
||||
|
||||
|
||||
def test_settings_payload_includes_token_usage_summary(
|
||||
tmp_path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
|
||||
129
tests/webui/test_transcription_ws.py
Normal file
129
tests/webui/test_transcription_ws.py
Normal file
@ -0,0 +1,129 @@
|
||||
"""Tests for WebUI transcription envelopes carried over the gateway socket."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from nanobot.config.loader import save_config
|
||||
from nanobot.config.schema import Config
|
||||
from nanobot.webui.transcription_ws import webui_transcription_event
|
||||
|
||||
|
||||
def _audio_data_url(payload: bytes = b"voice", mime: str = "audio/webm") -> str:
|
||||
return f"data:{mime};base64,{base64.b64encode(payload).decode('ascii')}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_webui_transcribe_audio_rejects_unconfigured_provider(
|
||||
tmp_path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
config_path = tmp_path / "config.json"
|
||||
config = Config()
|
||||
config.transcription.provider = "groq"
|
||||
save_config(config, config_path)
|
||||
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||
|
||||
event, payload = await webui_transcription_event({
|
||||
"request_id": "voice-1",
|
||||
"data_url": _audio_data_url(),
|
||||
})
|
||||
|
||||
assert event == "transcription_error"
|
||||
assert payload == {
|
||||
"request_id": "voice-1",
|
||||
"detail": "not_configured",
|
||||
"provider": "groq",
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_webui_transcribe_audio_rejects_unsupported_mime(
|
||||
tmp_path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
config_path = tmp_path / "config.json"
|
||||
config = Config()
|
||||
config.transcription.provider = "groq"
|
||||
config.providers.groq.api_key = "gsk-test"
|
||||
save_config(config, config_path)
|
||||
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||
|
||||
event, payload = await webui_transcription_event({
|
||||
"request_id": "voice-1",
|
||||
"data_url": _audio_data_url(mime="text/plain"),
|
||||
})
|
||||
|
||||
assert event == "transcription_error"
|
||||
assert payload["request_id"] == "voice-1"
|
||||
assert payload["detail"] == "mime"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_webui_transcribe_audio_rejects_oversized_audio(
|
||||
tmp_path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
config_path = tmp_path / "config.json"
|
||||
config = Config()
|
||||
config.transcription.provider = "groq"
|
||||
config.transcription.max_upload_mb = 1
|
||||
config.providers.groq.api_key = "gsk-test"
|
||||
save_config(config, config_path)
|
||||
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||
monkeypatch.setattr("nanobot.audio.transcription.get_media_dir", lambda _channel=None: tmp_path)
|
||||
|
||||
event, payload = await webui_transcription_event({
|
||||
"request_id": "voice-1",
|
||||
"data_url": _audio_data_url(payload=b"x" * (1024 * 1024 + 1)),
|
||||
})
|
||||
|
||||
assert event == "transcription_error"
|
||||
assert payload["request_id"] == "voice-1"
|
||||
assert payload["detail"] == "size"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_webui_transcribe_audio_returns_text_and_removes_temp_file(
|
||||
tmp_path,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
config_path = tmp_path / "config.json"
|
||||
media_dir = tmp_path / "media"
|
||||
media_dir.mkdir()
|
||||
config = Config()
|
||||
config.transcription.provider = "groq"
|
||||
config.providers.groq.api_key = "gsk-test"
|
||||
save_config(config, config_path)
|
||||
monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
|
||||
monkeypatch.setattr(
|
||||
"nanobot.audio.transcription.get_media_dir",
|
||||
lambda _channel=None: media_dir,
|
||||
)
|
||||
captured_paths: list[Path] = []
|
||||
|
||||
async def fake_transcribe_audio_file(path: str | Path, _resolved: Any) -> str:
|
||||
p = Path(path)
|
||||
assert p.exists()
|
||||
captured_paths.append(p)
|
||||
return "hello voice"
|
||||
|
||||
monkeypatch.setattr(
|
||||
"nanobot.audio.transcription.transcribe_audio_file",
|
||||
fake_transcribe_audio_file,
|
||||
)
|
||||
|
||||
event, payload = await webui_transcription_event({
|
||||
"request_id": "voice-1",
|
||||
"data_url": _audio_data_url(payload=b"webm voice", mime="audio/webm;codecs=opus"),
|
||||
"duration_ms": 1200,
|
||||
})
|
||||
|
||||
assert event == "transcription_result"
|
||||
assert payload == {"request_id": "voice-1", "text": "hello voice"}
|
||||
assert captured_paths
|
||||
assert not captured_paths[0].exists()
|
||||
@ -81,6 +81,7 @@ const SETTINGS_SECTION_KEYS: SettingsSectionKey[] = [
|
||||
"appearance",
|
||||
"models",
|
||||
"image",
|
||||
"voice",
|
||||
"browser",
|
||||
"apps",
|
||||
"skills",
|
||||
|
||||
@ -1,8 +1,9 @@
|
||||
import { Suspense, lazy, useCallback, useState } from "react";
|
||||
import { Suspense, lazy, useCallback, useState, type ReactNode } from "react";
|
||||
import { Check, Copy } from "lucide-react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
|
||||
import { useThemeValue } from "@/hooks/useTheme";
|
||||
import { hasAnsi, parseAnsiSegments, stripAnsi } from "@/lib/ansi";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
||||
interface CodeBlockProps {
|
||||
@ -36,6 +37,10 @@ const CODE_FONT_STACK = [
|
||||
"monospace",
|
||||
].join(", ");
|
||||
|
||||
const ANSI_LANGUAGES = new Set(["ansi", "ansi-output"]);
|
||||
const CODE_SURFACE_LIGHT = "#f4f4f5";
|
||||
const CODE_SURFACE_DARK = "#27272a";
|
||||
|
||||
const LazyHighlightedCode = lazy(async () => {
|
||||
const [
|
||||
{ default: SyntaxHighlighter },
|
||||
@ -74,7 +79,11 @@ const LazyHighlightedCode = lazy(async () => {
|
||||
language={language || "text"}
|
||||
style={transparentTheme}
|
||||
customStyle={{
|
||||
background: chrome === "none" ? "transparent" : undefined,
|
||||
background: chrome === "none"
|
||||
? "transparent"
|
||||
: isDark
|
||||
? CODE_SURFACE_DARK
|
||||
: CODE_SURFACE_LIGHT,
|
||||
margin: 0,
|
||||
padding: chrome === "none" ? "0.75rem 1rem" : "1rem",
|
||||
fontFamily: CODE_FONT_STACK,
|
||||
@ -83,10 +92,10 @@ const LazyHighlightedCode = lazy(async () => {
|
||||
tabSize: 2,
|
||||
}}
|
||||
codeTagProps={{
|
||||
style: chrome === "none" ? {
|
||||
style: {
|
||||
background: "transparent",
|
||||
fontFamily: CODE_FONT_STACK,
|
||||
} : undefined,
|
||||
},
|
||||
}}
|
||||
lineNumberStyle={{
|
||||
minWidth: "2.6em",
|
||||
@ -106,14 +115,32 @@ const LazyHighlightedCode = lazy(async () => {
|
||||
};
|
||||
});
|
||||
|
||||
function PlainCodeFallback({
|
||||
function renderPlainText(value: string): ReactNode {
|
||||
return value;
|
||||
}
|
||||
|
||||
function renderAnsiText(value: string): ReactNode {
|
||||
return parseAnsiSegments(value).map((segment, index) => (
|
||||
<span key={index} style={segment.style}>
|
||||
{segment.text}
|
||||
</span>
|
||||
));
|
||||
}
|
||||
|
||||
function CodeTextBlock({
|
||||
code,
|
||||
chrome,
|
||||
showLineNumbers,
|
||||
testId,
|
||||
className,
|
||||
renderText = renderPlainText,
|
||||
}: {
|
||||
code: string;
|
||||
chrome: "default" | "none";
|
||||
showLineNumbers: boolean;
|
||||
testId: string;
|
||||
className?: string;
|
||||
renderText?: (value: string) => ReactNode;
|
||||
}) {
|
||||
const lines = code.split("\n");
|
||||
return (
|
||||
@ -121,10 +148,11 @@ function PlainCodeFallback({
|
||||
className={cn(
|
||||
"m-0 overflow-x-auto p-4 font-mono text-sm leading-[1.6] text-foreground/90",
|
||||
showLineNumbers ? "whitespace-pre" : "whitespace-pre-wrap",
|
||||
chrome === "default" ? "bg-background" : "bg-transparent",
|
||||
chrome === "default" ? "bg-zinc-100 dark:bg-zinc-800" : "bg-transparent",
|
||||
chrome === "none" && "p-3 text-[13px] leading-[1.55]",
|
||||
className,
|
||||
)}
|
||||
data-testid="plain-code-fallback"
|
||||
data-testid={testId}
|
||||
>
|
||||
<code className="text-inherit">
|
||||
{showLineNumbers ? (
|
||||
@ -133,16 +161,21 @@ function PlainCodeFallback({
|
||||
<span className="w-10 shrink-0 select-none pr-4 text-right text-muted-foreground/60">
|
||||
{index + 1}
|
||||
</span>
|
||||
<span className="whitespace-pre">{line || " "}</span>
|
||||
<span className="whitespace-pre">{renderText(line || " ")}</span>
|
||||
{index < lines.length - 1 ? "\n" : null}
|
||||
</span>
|
||||
))
|
||||
) : code}
|
||||
) : renderText(code)}
|
||||
</code>
|
||||
</pre>
|
||||
);
|
||||
}
|
||||
|
||||
function shouldRenderAnsi(language: string | undefined, code: string): boolean {
|
||||
const normalized = language?.trim().toLowerCase();
|
||||
return Boolean((normalized && ANSI_LANGUAGES.has(normalized)) || hasAnsi(code));
|
||||
}
|
||||
|
||||
export function CodeBlock({
|
||||
language,
|
||||
code,
|
||||
@ -156,19 +189,20 @@ export function CodeBlock({
|
||||
const [copied, setCopied] = useState(false);
|
||||
const isDark = useThemeValue() === "dark";
|
||||
const hasChrome = chrome === "default";
|
||||
const renderAnsi = shouldRenderAnsi(language, code);
|
||||
|
||||
const onCopy = useCallback(() => {
|
||||
if (!navigator.clipboard) return;
|
||||
navigator.clipboard.writeText(code).then(() => {
|
||||
navigator.clipboard.writeText(renderAnsi ? stripAnsi(code) : code).then(() => {
|
||||
setCopied(true);
|
||||
setTimeout(() => setCopied(false), 1_500);
|
||||
});
|
||||
}, [code]);
|
||||
}, [code, renderAnsi]);
|
||||
|
||||
return (
|
||||
<div
|
||||
className={cn(
|
||||
"overflow-hidden",
|
||||
"not-prose overflow-hidden",
|
||||
hasChrome && "rounded-lg border",
|
||||
hasChrome && (isDark ? "border-white/10" : "border-black/10"),
|
||||
className,
|
||||
@ -177,7 +211,7 @@ export function CodeBlock({
|
||||
{hasChrome ? (
|
||||
<div
|
||||
className={cn(
|
||||
"flex items-center justify-between px-4 py-1.5 text-xs font-medium",
|
||||
"flex items-center justify-between px-4 pb-1.5 pt-2 text-xs font-medium",
|
||||
isDark
|
||||
? "bg-zinc-800 text-zinc-300"
|
||||
: "bg-zinc-100 text-zinc-600",
|
||||
@ -206,13 +240,22 @@ export function CodeBlock({
|
||||
</button>
|
||||
</div>
|
||||
) : null}
|
||||
{highlight ? (
|
||||
{renderAnsi ? (
|
||||
<CodeTextBlock
|
||||
code={code}
|
||||
chrome={chrome}
|
||||
showLineNumbers={showLineNumbers}
|
||||
testId="ansi-code"
|
||||
renderText={renderAnsiText}
|
||||
/>
|
||||
) : highlight ? (
|
||||
<Suspense
|
||||
fallback={
|
||||
<PlainCodeFallback
|
||||
<CodeTextBlock
|
||||
code={code}
|
||||
chrome={chrome}
|
||||
showLineNumbers={showLineNumbers}
|
||||
testId="plain-code-fallback"
|
||||
/>
|
||||
}
|
||||
>
|
||||
@ -226,10 +269,11 @@ export function CodeBlock({
|
||||
/>
|
||||
</Suspense>
|
||||
) : (
|
||||
<PlainCodeFallback
|
||||
<CodeTextBlock
|
||||
code={code}
|
||||
chrome={chrome}
|
||||
showLineNumbers={showLineNumbers}
|
||||
testId="plain-code-fallback"
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
|
||||
@ -31,6 +31,7 @@ import {
|
||||
Layers,
|
||||
Loader2,
|
||||
LogOut,
|
||||
Mic,
|
||||
Moon,
|
||||
PlayCircle,
|
||||
Plus,
|
||||
@ -92,6 +93,7 @@ import {
|
||||
updateNetworkSafetySettings,
|
||||
updateProviderSettings,
|
||||
updateSettings,
|
||||
updateTranscriptionSettings,
|
||||
updateWebSearchSettings,
|
||||
} from "@/lib/api";
|
||||
import { notifyCliAppsChanged } from "@/lib/cli-app-events";
|
||||
@ -115,6 +117,7 @@ import type {
|
||||
ProviderModelsPayload,
|
||||
SettingsPayload,
|
||||
SkillSummary,
|
||||
TranscriptionSettingsUpdate,
|
||||
WebSearchSettingsUpdate,
|
||||
WebuiDefaultAccessMode,
|
||||
} from "@/lib/types";
|
||||
@ -124,6 +127,7 @@ export type SettingsSectionKey =
|
||||
| "appearance"
|
||||
| "models"
|
||||
| "image"
|
||||
| "voice"
|
||||
| "browser"
|
||||
| "apps"
|
||||
| "skills"
|
||||
@ -367,6 +371,26 @@ const DEFAULT_IMAGE_GENERATION_FORM: ImageGenerationSettingsUpdate = {
|
||||
maxImagesPerTurn: 4,
|
||||
};
|
||||
|
||||
const DEFAULT_TRANSCRIPTION_FORM: TranscriptionSettingsUpdate = {
|
||||
enabled: true,
|
||||
provider: "groq",
|
||||
model: "",
|
||||
language: "",
|
||||
maxDurationSec: 120,
|
||||
maxUploadMb: 25,
|
||||
};
|
||||
|
||||
const DEFAULT_TRANSCRIPTION_SETTINGS: NonNullable<SettingsPayload["transcription"]> = {
|
||||
enabled: true,
|
||||
provider: "groq",
|
||||
provider_configured: false,
|
||||
model: "whisper-large-v3",
|
||||
language: null,
|
||||
max_duration_sec: 120,
|
||||
max_upload_mb: 25,
|
||||
providers: [],
|
||||
};
|
||||
|
||||
const DEFAULT_NETWORK_SAFETY_FORM: NetworkSafetySettingsUpdate = {
|
||||
webuiAllowLocalServiceAccess: true,
|
||||
webuiDefaultAccessMode: "default",
|
||||
@ -419,6 +443,18 @@ function imageGenerationFormFromPayload(payload: SettingsPayload): ImageGenerati
|
||||
};
|
||||
}
|
||||
|
||||
function transcriptionFormFromPayload(payload: SettingsPayload): TranscriptionSettingsUpdate {
|
||||
const transcription = payload.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
|
||||
return {
|
||||
enabled: transcription.enabled,
|
||||
provider: transcription.provider,
|
||||
model: transcription.model,
|
||||
language: transcription.language ?? "",
|
||||
maxDurationSec: transcription.max_duration_sec,
|
||||
maxUploadMb: transcription.max_upload_mb,
|
||||
};
|
||||
}
|
||||
|
||||
function networkSafetyFormFromPayload(payload: SettingsPayload): NetworkSafetySettingsUpdate {
|
||||
return {
|
||||
webuiAllowLocalServiceAccess:
|
||||
@ -479,6 +515,7 @@ export function SettingsView({
|
||||
const [providerSaving, setProviderSaving] = useState<string | null>(null);
|
||||
const [webSearchSaving, setWebSearchSaving] = useState(false);
|
||||
const [imageGenerationSaving, setImageGenerationSaving] = useState(false);
|
||||
const [transcriptionSaving, setTranscriptionSaving] = useState(false);
|
||||
const [networkSafetySaving, setNetworkSafetySaving] = useState(false);
|
||||
const [hostEngineApplying, setHostEngineApplying] = useState(false);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
@ -511,6 +548,9 @@ export function SettingsView({
|
||||
? imageGenerationFormFromPayload(initialSettings)
|
||||
: DEFAULT_IMAGE_GENERATION_FORM,
|
||||
);
|
||||
const [transcriptionForm, setTranscriptionForm] = useState<TranscriptionSettingsUpdate>(
|
||||
() => initialSettings ? transcriptionFormFromPayload(initialSettings) : DEFAULT_TRANSCRIPTION_FORM,
|
||||
);
|
||||
const [networkSafetyForm, setNetworkSafetyForm] = useState<NetworkSafetySettingsUpdate>(() =>
|
||||
initialSettings ? networkSafetyFormFromPayload(initialSettings) : DEFAULT_NETWORK_SAFETY_FORM,
|
||||
);
|
||||
@ -543,6 +583,7 @@ export function SettingsView({
|
||||
setForm(agentDraftFromPayload(payload));
|
||||
setWebSearchForm((prev) => webSearchFormFromPayload(payload, prev));
|
||||
setImageGenerationForm(imageGenerationFormFromPayload(payload));
|
||||
setTranscriptionForm(transcriptionFormFromPayload(payload));
|
||||
setNetworkSafetyForm(networkSafetyFormFromPayload(payload));
|
||||
if (payload.restart_required_sections) {
|
||||
setPendingRestartSections(pendingRestartSectionsFromPayload(payload));
|
||||
@ -711,6 +752,19 @@ export function SettingsView({
|
||||
);
|
||||
}, [imageGenerationForm, settings]);
|
||||
|
||||
const transcriptionDirty = useMemo(() => {
|
||||
if (!settings) return false;
|
||||
const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
|
||||
return (
|
||||
transcriptionForm.enabled !== transcription.enabled ||
|
||||
transcriptionForm.provider !== transcription.provider ||
|
||||
transcriptionForm.model !== transcription.model ||
|
||||
transcriptionForm.language !== (transcription.language ?? "") ||
|
||||
transcriptionForm.maxDurationSec !== transcription.max_duration_sec ||
|
||||
transcriptionForm.maxUploadMb !== transcription.max_upload_mb
|
||||
);
|
||||
}, [settings, transcriptionForm]);
|
||||
|
||||
const networkSafetyDirty = useMemo(() => {
|
||||
if (!settings) return false;
|
||||
const currentLocalServiceAccess =
|
||||
@ -913,6 +967,24 @@ export function SettingsView({
|
||||
}
|
||||
};
|
||||
|
||||
const saveTranscriptionSettings = async () => {
|
||||
if (!settings || !transcriptionDirty || transcriptionSaving) return;
|
||||
setTranscriptionSaving(true);
|
||||
try {
|
||||
const payload = await updateTranscriptionSettings(token, transcriptionForm);
|
||||
applyPayload(payload);
|
||||
if (payload.requires_restart) {
|
||||
setPendingRestartSections((prev) => ({ ...prev, browser: true }));
|
||||
}
|
||||
await maybeRestartHostEngine(payload);
|
||||
setError(null);
|
||||
} catch (err) {
|
||||
setError((err as Error).message);
|
||||
} finally {
|
||||
setTranscriptionSaving(false);
|
||||
}
|
||||
};
|
||||
|
||||
const saveNetworkSafetySettings = async () => {
|
||||
if (!settings || !networkSafetyDirty || networkSafetySaving) return;
|
||||
setNetworkSafetySaving(true);
|
||||
@ -1333,6 +1405,22 @@ export function SettingsView({
|
||||
requiresRestartPending={pendingRestartSections.image}
|
||||
/>
|
||||
);
|
||||
case "voice":
|
||||
return (
|
||||
<TranscriptionSettings
|
||||
settings={settings}
|
||||
form={transcriptionForm}
|
||||
dirty={transcriptionDirty}
|
||||
saving={transcriptionSaving}
|
||||
onChangeForm={setTranscriptionForm}
|
||||
onSave={saveTranscriptionSettings}
|
||||
onOpenProviders={() => selectSection("models")}
|
||||
showBrandLogos={localPrefs.brandLogos}
|
||||
onRestart={restartViaSettingsSurface}
|
||||
isRestarting={isRestarting || hostEngineApplying}
|
||||
requiresRestartPending={pendingRestartSections.browser}
|
||||
/>
|
||||
);
|
||||
case "browser":
|
||||
return (
|
||||
<WebSettings
|
||||
@ -1523,6 +1611,7 @@ const SETTINGS_NAV_ITEMS: Array<{ key: SettingsSectionKey; icon: LucideIcon; fal
|
||||
{ key: "appearance", icon: Palette, fallback: "Appearance" },
|
||||
{ key: "models", icon: SlidersHorizontal, fallback: "Models" },
|
||||
{ key: "image", icon: ImageIcon, fallback: "Image" },
|
||||
{ key: "voice", icon: Mic, fallback: "Voice" },
|
||||
{ key: "browser", icon: Globe2, fallback: "Web" },
|
||||
{ key: "runtime", icon: Server, fallback: "System" },
|
||||
{ key: "advanced", icon: ShieldCheck, fallback: "Security" },
|
||||
@ -1642,6 +1731,24 @@ function OverviewSettings({
|
||||
const webStatus = settings.web.enable
|
||||
? tx("settings.values.enabled", "Enabled")
|
||||
: tx("settings.values.disabled", "Disabled");
|
||||
const webSearchProvider =
|
||||
settings.web_search.providers.find((provider) => provider.name === settings.web_search.provider) ??
|
||||
settings.web_search.providers[0];
|
||||
const webSearchProviderLabel = providerDisplayLabel(
|
||||
settings.web_search.providers,
|
||||
settings.web_search.provider,
|
||||
);
|
||||
const webSearchCredentialStatus =
|
||||
webSearchProvider?.credential === "none"
|
||||
? tx("settings.byok.webSearch.noCredentialRequired", "No key required")
|
||||
: webSearchProvider?.credential === "base_url"
|
||||
? settings.web_search.base_url
|
||||
? tx("settings.values.configured", "Configured")
|
||||
: tx("settings.values.notConfigured", "Not configured")
|
||||
: settings.web_search.api_key_hint
|
||||
? tx("settings.values.configured", "Configured")
|
||||
: tx("settings.values.notConfigured", "Not configured");
|
||||
const webCaption = `${webSearchProviderLabel} · ${webSearchCredentialStatus}`;
|
||||
const imageStatus = settings.image_generation.enabled
|
||||
? tx("settings.values.enabled", "Enabled")
|
||||
: tx("settings.values.disabled", "Disabled");
|
||||
@ -1650,6 +1757,15 @@ function OverviewSettings({
|
||||
? tx("settings.values.configured", "Configured")
|
||||
: tx("settings.values.notConfigured", "Not configured")
|
||||
}`;
|
||||
const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
|
||||
const voiceStatus = transcription.enabled
|
||||
? tx("settings.values.enabled", "Enabled")
|
||||
: tx("settings.values.disabled", "Disabled");
|
||||
const voiceCaption = `${providerDisplayLabel(transcription.providers, transcription.provider)} · ${
|
||||
transcription.provider_configured
|
||||
? tx("settings.values.configured", "Configured")
|
||||
: tx("settings.values.notConfigured", "Not configured")
|
||||
}`;
|
||||
const isNativeHost = (settings.surface ?? settings.runtime_surface) === "native";
|
||||
const workspaceCaption = shortWorkspacePath(settings.runtime.workspace_path);
|
||||
const runtimeTitle = isNativeHost
|
||||
@ -1691,8 +1807,8 @@ function OverviewSettings({
|
||||
icon={Globe2}
|
||||
valueLogoProvider={settings.web_search.provider}
|
||||
title={tx("settings.overview.webSearch", "Web search")}
|
||||
value={providerDisplayLabel(settings.web_search.providers, settings.web_search.provider)}
|
||||
caption={webStatus}
|
||||
value={webStatus}
|
||||
caption={webCaption}
|
||||
showBrandLogos={showBrandLogos}
|
||||
onClick={() => onSelectSection("browser")}
|
||||
/>
|
||||
@ -1705,6 +1821,15 @@ function OverviewSettings({
|
||||
showBrandLogos={showBrandLogos}
|
||||
onClick={() => onSelectSection("image")}
|
||||
/>
|
||||
<OverviewListRow
|
||||
icon={Mic}
|
||||
valueLogoProvider={transcription.provider}
|
||||
title={tx("settings.overview.voiceInput", "Voice input")}
|
||||
value={voiceStatus}
|
||||
caption={voiceCaption}
|
||||
showBrandLogos={showBrandLogos}
|
||||
onClick={() => onSelectSection("voice")}
|
||||
/>
|
||||
</SettingsGroup>
|
||||
</section>
|
||||
|
||||
@ -2654,6 +2779,137 @@ function ImageGenerationSettings({
|
||||
);
|
||||
}
|
||||
|
||||
function TranscriptionSettings({
|
||||
settings,
|
||||
form,
|
||||
dirty,
|
||||
saving,
|
||||
onChangeForm,
|
||||
onSave,
|
||||
onOpenProviders,
|
||||
showBrandLogos,
|
||||
onRestart,
|
||||
isRestarting,
|
||||
requiresRestartPending,
|
||||
}: {
|
||||
settings: SettingsPayload;
|
||||
form: TranscriptionSettingsUpdate;
|
||||
dirty: boolean;
|
||||
saving: boolean;
|
||||
onChangeForm: Dispatch<SetStateAction<TranscriptionSettingsUpdate>>;
|
||||
onSave: () => void;
|
||||
onOpenProviders: () => void;
|
||||
showBrandLogos: boolean;
|
||||
onRestart?: () => void;
|
||||
isRestarting?: boolean;
|
||||
requiresRestartPending: boolean;
|
||||
}) {
|
||||
const { t } = useTranslation();
|
||||
const tx = (key: string, fallback: string) => t(key, { defaultValue: fallback });
|
||||
const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
|
||||
const selectedProvider =
|
||||
transcription.providers.find((provider) => provider.name === form.provider) ??
|
||||
transcription.providers[0];
|
||||
const providerConfigured = !!selectedProvider?.configured;
|
||||
|
||||
return (
|
||||
<section>
|
||||
<SettingsSectionTitle>{tx("settings.sections.voiceInput", "Voice input")}</SettingsSectionTitle>
|
||||
<SettingsGroup>
|
||||
<SettingsRow
|
||||
title={tx("settings.rows.transcription", "Transcription")}
|
||||
description={tx("settings.help.transcription", "Transcribe microphone input before sending it. Chat channel voice messages use the same settings.")}
|
||||
>
|
||||
<ToggleButton
|
||||
checked={form.enabled}
|
||||
onChange={(enabled) => onChangeForm((prev) => ({ ...prev, enabled }))}
|
||||
ariaLabel={tx("settings.rows.transcription", "Transcription")}
|
||||
label={form.enabled ? tx("settings.values.on", "On") : tx("settings.values.off", "Off")}
|
||||
/>
|
||||
</SettingsRow>
|
||||
<SettingsRow
|
||||
title={tx("settings.rows.transcriptionProvider", "Provider")}
|
||||
description={tx("settings.help.transcriptionProvider", "Uses the matching provider credentials from Providers.")}
|
||||
>
|
||||
<ProviderPicker
|
||||
providers={transcription.providers}
|
||||
value={form.provider}
|
||||
emptyLabel={tx("settings.voice.selectProvider", "Select provider")}
|
||||
showProviderLogos={showBrandLogos}
|
||||
onChange={(provider) => onChangeForm((prev) => ({ ...prev, provider }))}
|
||||
/>
|
||||
</SettingsRow>
|
||||
<SettingsRow
|
||||
title={tx("settings.rows.transcriptionProviderStatus", "Provider status")}
|
||||
description={tx("settings.help.transcriptionProviderStatus", "API keys stay under providers, not in transcription settings.")}
|
||||
>
|
||||
<div className="flex flex-wrap items-center justify-end gap-2">
|
||||
<StatusPill tone={providerConfigured ? "success" : "neutral"}>
|
||||
{providerConfigured
|
||||
? tx("settings.values.configured", "Configured")
|
||||
: tx("settings.values.notConfigured", "Not configured")}
|
||||
</StatusPill>
|
||||
{!providerConfigured ? (
|
||||
<Button size="sm" variant="outline" onClick={onOpenProviders} className="rounded-full">
|
||||
{tx("settings.voice.configureProvider", "Configure provider")}
|
||||
</Button>
|
||||
) : null}
|
||||
</div>
|
||||
</SettingsRow>
|
||||
<SettingsRow
|
||||
title={tx("settings.rows.transcriptionModel", "Model")}
|
||||
description={tx("settings.help.transcriptionModel", "Leave as the resolved default unless your provider needs a custom model id.")}
|
||||
>
|
||||
<Input
|
||||
value={form.model}
|
||||
onChange={(event) => onChangeForm((prev) => ({ ...prev, model: event.target.value }))}
|
||||
className="h-8 w-[min(300px,70vw)] rounded-full text-[13px]"
|
||||
/>
|
||||
</SettingsRow>
|
||||
<SettingsRow
|
||||
title={tx("settings.rows.transcriptionLanguage", "Language")}
|
||||
description={tx("settings.help.transcriptionLanguage", "Optional ISO-639 hint such as en, zh, ja, or ko.")}
|
||||
>
|
||||
<Input
|
||||
value={form.language}
|
||||
onChange={(event) => onChangeForm((prev) => ({ ...prev, language: event.target.value }))}
|
||||
placeholder={tx("settings.voice.languageAuto", "Auto")}
|
||||
className="h-8 w-[min(180px,60vw)] rounded-full text-[13px]"
|
||||
/>
|
||||
</SettingsRow>
|
||||
<SettingsRow title={tx("settings.rows.voiceLimits", "Limits")}>
|
||||
<div className="flex flex-wrap justify-end gap-2">
|
||||
<NumberInput
|
||||
value={form.maxDurationSec}
|
||||
min={1}
|
||||
max={600}
|
||||
suffix="s"
|
||||
onChange={(maxDurationSec) => onChangeForm((prev) => ({ ...prev, maxDurationSec }))}
|
||||
/>
|
||||
<NumberInput
|
||||
value={form.maxUploadMb}
|
||||
min={1}
|
||||
max={100}
|
||||
suffix="MB"
|
||||
onChange={(maxUploadMb) => onChangeForm((prev) => ({ ...prev, maxUploadMb }))}
|
||||
/>
|
||||
</div>
|
||||
</SettingsRow>
|
||||
<RestartSettingsFooter
|
||||
dirty={dirty}
|
||||
saving={saving}
|
||||
pendingRestart={requiresRestartPending}
|
||||
dirtyMessage={tx("settings.status.restartAfterSaving", "Save changes, then restart when ready.")}
|
||||
pendingMessage={tx("settings.status.savedRestartApply", "Saved. Restart when ready.")}
|
||||
onSave={onSave}
|
||||
onRestart={onRestart}
|
||||
isRestarting={isRestarting}
|
||||
/>
|
||||
</SettingsGroup>
|
||||
</section>
|
||||
);
|
||||
}
|
||||
|
||||
function WebSettings({
|
||||
settings,
|
||||
form,
|
||||
|
||||
@ -78,16 +78,13 @@ function buildTokenUsageCalendar(
|
||||
const today = utcDateFromIsoDay(isoDayInTimeZone(new Date(), timeZone));
|
||||
const end = addUtcDays(today, 6 - today.getUTCDay());
|
||||
const start = addUtcDays(end, -(TOKEN_HEATMAP_CELLS - 1));
|
||||
const seenMonths = new Set<string>();
|
||||
const monthLabels: TokenUsageMonthLabel[] = [];
|
||||
|
||||
const cells = Array.from({ length: TOKEN_HEATMAP_CELLS }, (_, index) => {
|
||||
const date = addUtcDays(start, index);
|
||||
const key = isoDay(date);
|
||||
const row = byDate.get(key);
|
||||
const monthKey = key.slice(0, 7);
|
||||
if (!seenMonths.has(monthKey)) {
|
||||
seenMonths.add(monthKey);
|
||||
if (date.getUTCDate() === 1) {
|
||||
monthLabels.push({
|
||||
label: monthFormatter.format(date),
|
||||
column: Math.floor(index / 7) + 1,
|
||||
@ -186,16 +183,12 @@ export function TokenUsageHeatmap({
|
||||
{tx("settings.usage.shortTitle", "Token Usage")}
|
||||
</span>
|
||||
</div>
|
||||
<div
|
||||
className="mb-2 grid min-h-4 gap-1.5 text-[10px] font-normal leading-4 text-muted-foreground/62"
|
||||
style={{ gridTemplateColumns: `repeat(${TOKEN_HEATMAP_COLUMNS}, minmax(0, 1fr))` }}
|
||||
aria-hidden
|
||||
>
|
||||
<div className="relative mb-2 h-4 text-[10px] font-normal leading-4 text-muted-foreground/62" aria-hidden>
|
||||
{monthLabels.map((month) => (
|
||||
<span
|
||||
key={`${month.label}-${month.column}`}
|
||||
className="whitespace-nowrap"
|
||||
style={{ gridColumnStart: month.column, gridColumnEnd: "span 4" }}
|
||||
className="absolute top-0 whitespace-nowrap"
|
||||
style={{ left: `${((month.column - 1) / TOKEN_HEATMAP_COLUMNS) * 100}%` }}
|
||||
>
|
||||
{month.label}
|
||||
</span>
|
||||
|
||||
@ -31,6 +31,7 @@ import {
|
||||
History,
|
||||
ImageIcon,
|
||||
Loader2,
|
||||
Mic,
|
||||
Plus,
|
||||
RotateCw,
|
||||
Shield,
|
||||
@ -46,6 +47,12 @@ import {
|
||||
import { useTranslation } from "react-i18next";
|
||||
|
||||
import { Button } from "@/components/ui/button";
|
||||
import {
|
||||
Tooltip,
|
||||
TooltipContent,
|
||||
TooltipProvider,
|
||||
TooltipTrigger,
|
||||
} from "@/components/ui/tooltip";
|
||||
import {
|
||||
WorkspaceAccessMenu,
|
||||
WorkspaceProjectPicker,
|
||||
@ -59,6 +66,7 @@ import {
|
||||
} from "@/hooks/useAttachedImages";
|
||||
import { useClipboardAndDrop } from "@/hooks/useClipboardAndDrop";
|
||||
import type { SendImage, SendOptions } from "@/hooks/useNanobotStream";
|
||||
import { useVoiceRecorder, type VoiceRecorderErrorKey } from "@/hooks/useVoiceRecorder";
|
||||
import type {
|
||||
CliAppInfo,
|
||||
GoalStateWsPayload,
|
||||
@ -79,6 +87,9 @@ import { cn } from "@/lib/utils";
|
||||
/** ``<input accept>``: aligned with the server's MIME whitelist. SVG is
|
||||
* deliberately excluded to avoid an embedded-script XSS surface. */
|
||||
const ACCEPT_ATTR = "image/png,image/jpeg,image/webp,image/gif";
|
||||
/** `KeyboardEvent.code` of the letter key in the voice-input shortcut (Ctrl+Shift+D). */
const VOICE_SHORTCUT_CODE = "KeyD";
/** Shortcut spelled in the format used for the voice button's `aria-keyshortcuts` attribute. */
const VOICE_SHORTCUT_ARIA = "Control+Shift+D";
/** Platform families distinguished when choosing how to render the shortcut label. */
type VoiceShortcutPlatform = "apple" | "chromeos" | "linux" | "other" | "windows";
|
||||
|
||||
function formatBytes(n: number): string {
|
||||
if (n < 1024) return `${n} B`;
|
||||
@ -86,6 +97,54 @@ function formatBytes(n: number): string {
|
||||
return `${(n / (1024 * 1024)).toFixed(1)} MB`;
|
||||
}
|
||||
|
||||
/**
 * Reports whether a keyboard event is exactly the voice shortcut chord:
 * the shortcut key together with Ctrl and Shift, and with neither Alt nor
 * Meta held.
 */
function isVoiceShortcutDown(event: KeyboardEvent): boolean {
  if (event.code !== VOICE_SHORTCUT_CODE) return false;
  if (!event.ctrlKey || !event.shiftKey) return false;
  // Extra modifiers mean a different chord; leave those to other handlers.
  return !(event.altKey || event.metaKey);
}
|
||||
|
||||
/**
 * Reports whether a key-up event releases any part of the voice shortcut
 * chord: the shortcut key itself, or either of the Control / Shift modifiers.
 */
function isVoiceShortcutRelease(event: KeyboardEvent): boolean {
  if (event.code === VOICE_SHORTCUT_CODE) return true;
  return event.key === "Control" || event.key === "Shift";
}
|
||||
|
||||
/**
 * Classifies the current runtime into a platform family for shortcut labels.
 *
 * Combines `navigator.userAgentData.platform` (when present),
 * `navigator.platform` and `navigator.userAgent` into one lowercase string
 * and matches it against known platform markers. Returns "other" when no
 * `navigator` global exists or nothing matches.
 */
function getVoiceShortcutPlatform(): VoiceShortcutPlatform {
  if (typeof navigator === "undefined") return "other";

  const nav = navigator as Navigator & { userAgentData?: { platform?: string } };
  const hints = [nav.userAgentData?.platform, nav.platform, nav.userAgent];
  const haystack = hints.filter(Boolean).join(" ").toLowerCase();

  // A "MacIntel" platform with multi-touch support is treated as Apple too.
  const touchCapableMacIntel = nav.platform === "MacIntel" && nav.maxTouchPoints > 1;
  if (touchCapableMacIntel) return "apple";
  if (/mac|iphone|ipad|ipod/.test(haystack)) return "apple";

  // Order matters: Windows and ChromeOS are checked before the broad Linux pattern.
  if (/win/.test(haystack)) return "windows";
  if (/cros/.test(haystack)) return "chromeos";
  if (/linux|x11|android/.test(haystack)) return "linux";
  return "other";
}
|
||||
|
||||
/**
 * Returns the human-readable voice shortcut label for the detected platform:
 * modifier glyphs on Apple platforms, a "Ctrl"-style label everywhere else.
 */
function getVoiceShortcutLabel(): string {
  // A Record keyed by the platform union keeps the mapping exhaustive.
  const labelByPlatform: Record<VoiceShortcutPlatform, string> = {
    apple: "⌃⇧D",
    chromeos: "Ctrl ⇧ D",
    linux: "Ctrl ⇧ D",
    windows: "Ctrl ⇧ D",
    other: "Ctrl ⇧ D",
  };
  return labelByPlatform[getVoiceShortcutPlatform()];
}
|
||||
|
||||
interface ThreadComposerProps {
|
||||
onSend: (content: string, images?: SendImage[], options?: SendOptions) => void;
|
||||
disabled?: boolean;
|
||||
@ -101,6 +160,7 @@ interface ThreadComposerProps {
|
||||
cliApps?: CliAppInfo[];
|
||||
mcpPresets?: McpPresetInfo[];
|
||||
onStop?: () => void;
|
||||
onTranscribeAudio?: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
|
||||
/** Unix seconds from server; turn elapsed timer above input while set. */
|
||||
runStartedAt?: number | null;
|
||||
/** Sustained objective for this chat (WebSocket ``goal_state``). */
|
||||
@ -138,6 +198,45 @@ const QUEUED_PROMPTS_STORAGE_PREFIX = "nanobot.webui.composerQueuedGuidance.v1:"
|
||||
const QUEUED_PROMPTS_LIMIT = 20;
|
||||
const QUEUED_PROMPT_MAX_CHARS = 4000;
|
||||
|
||||
/**
 * Inline recording indicator: a row of waveform bars followed by the elapsed
 * time label.
 *
 * - `ariaLabel`: accessible label for the whole meter.
 * - `className`: extra classes merged onto the root element.
 * - `elapsedLabel`: pre-formatted elapsed time text.
 * - `isHero`: selects the shorter (hero) or taller root height.
 * - `levels`: one bar height per entry, rendered in order.
 */
function VoiceRecordingMeter({
  ariaLabel,
  className,
  elapsedLabel,
  isHero,
  levels,
}: {
  ariaLabel: string;
  className?: string;
  elapsedLabel: string;
  isHero: boolean;
  levels: number[];
}) {
  const rootClassName = cn(
    "flex min-w-0 items-center gap-2 text-neutral-700 dark:text-white",
    isHero ? "h-8" : "h-9",
    className,
  );

  // The bars are purely decorative; their container is hidden from assistive tech.
  const bars = levels.map((barHeight, barIndex) => (
    <span
      key={barIndex}
      className="w-[2px] rounded-full bg-current opacity-85 transition-[height] duration-75 ease-linear motion-reduce:transition-none"
      style={{ height: barHeight }}
    />
  ));

  return (
    <div className={rootClassName} aria-live="polite" aria-label={ariaLabel}>
      <span className="flex h-5 min-w-0 flex-1 items-center justify-between overflow-hidden" aria-hidden>
        {bars}
      </span>
      <span className="min-w-[2.1rem] text-right text-[12px] font-medium tabular-nums text-muted-foreground">
        {elapsedLabel}
      </span>
    </div>
  );
}
|
||||
|
||||
type SlashPalettePlacement = "above" | "below";
|
||||
|
||||
interface SlashPaletteLayout {
|
||||
@ -656,6 +755,7 @@ export function ThreadComposer({
|
||||
cliApps = [],
|
||||
mcpPresets = [],
|
||||
onStop,
|
||||
onTranscribeAudio,
|
||||
runStartedAt = null,
|
||||
goalState,
|
||||
workspaceScope = null,
|
||||
@ -685,7 +785,9 @@ export function ThreadComposer({
|
||||
const wasStreamingRef = useRef(isStreaming);
|
||||
const skipNextQueuedFlushRef = useRef(false);
|
||||
const skipQueuedPromptPersistRef = useRef(false);
|
||||
const voiceShortcutDownRef = useRef(false);
|
||||
const isHero = variant === "hero";
|
||||
const voiceShortcutLabel = useMemo(getVoiceShortcutLabel, []);
|
||||
const queuedPromptStorageKey = useMemo(
|
||||
() => queuedPromptsStorageKey(pendingQueueKey),
|
||||
[pendingQueueKey],
|
||||
@ -1026,6 +1128,65 @@ export function ThreadComposer({
|
||||
});
|
||||
}, []);
|
||||
|
||||
const appendTranscription = useCallback((text: string) => {
|
||||
const transcript = text.trim();
|
||||
if (!transcript) return;
|
||||
setValue((current) => {
|
||||
if (!current.trim()) return transcript;
|
||||
const separator = /[\s\n]$/.test(current) ? "" : " ";
|
||||
return `${current}${separator}${transcript}`;
|
||||
});
|
||||
setSlashMenuDismissed(false);
|
||||
setCliAppMenuDismissed(false);
|
||||
setInlineError(null);
|
||||
resizeTextarea();
|
||||
}, [resizeTextarea]);
|
||||
|
||||
const clearInlineError = useCallback(() => setInlineError(null), []);
|
||||
const setVoiceError = useCallback((key: VoiceRecorderErrorKey) => {
|
||||
setInlineError(t(`thread.composer.voiceErrors.${key}`));
|
||||
}, [t]);
|
||||
const voiceRecorder = useVoiceRecorder({
|
||||
disabled,
|
||||
onClearError: clearInlineError,
|
||||
onError: setVoiceError,
|
||||
onTranscript: appendTranscription,
|
||||
onTranscribeAudio,
|
||||
});
|
||||
|
||||
useEffect(() => {
|
||||
if (!onTranscribeAudio) return;
|
||||
|
||||
function onKeyDown(event: KeyboardEvent): void {
|
||||
if (!isVoiceShortcutDown(event) || event.repeat || voiceShortcutDownRef.current) return;
|
||||
event.preventDefault();
|
||||
voiceShortcutDownRef.current = true;
|
||||
voiceRecorder.beginShortcutHold();
|
||||
}
|
||||
|
||||
function onKeyUp(event: KeyboardEvent): void {
|
||||
if (!voiceShortcutDownRef.current || !isVoiceShortcutRelease(event)) return;
|
||||
event.preventDefault();
|
||||
voiceShortcutDownRef.current = false;
|
||||
voiceRecorder.endShortcutHold();
|
||||
}
|
||||
|
||||
function onWindowBlur(): void {
|
||||
if (!voiceShortcutDownRef.current) return;
|
||||
voiceShortcutDownRef.current = false;
|
||||
voiceRecorder.endShortcutHold();
|
||||
}
|
||||
|
||||
window.addEventListener("keydown", onKeyDown);
|
||||
window.addEventListener("keyup", onKeyUp);
|
||||
window.addEventListener("blur", onWindowBlur);
|
||||
return () => {
|
||||
window.removeEventListener("keydown", onKeyDown);
|
||||
window.removeEventListener("keyup", onKeyUp);
|
||||
window.removeEventListener("blur", onWindowBlur);
|
||||
};
|
||||
}, [onTranscribeAudio, voiceRecorder.beginShortcutHold, voiceRecorder.endShortcutHold]);
|
||||
|
||||
const chooseSlashCommand = useCallback(
|
||||
(command: SlashCommand) => {
|
||||
if (command.command === "/stop" && isStreaming && onStop) {
|
||||
@ -1341,6 +1502,23 @@ export function ThreadComposer({
|
||||
);
|
||||
|
||||
const attachButtonDisabled = disabled || full;
|
||||
const showVoiceButton = Boolean(onTranscribeAudio);
|
||||
const voiceRecordingStatusLabel = t("thread.composer.voice.recordingStatus", {
|
||||
time: voiceRecorder.elapsedLabel,
|
||||
defaultValue: `Recording ${voiceRecorder.elapsedLabel}`,
|
||||
});
|
||||
const voiceButtonLabel =
|
||||
voiceRecorder.state === "recording"
|
||||
? t("thread.composer.voice.stop")
|
||||
: voiceRecorder.state === "transcribing"
|
||||
? t("thread.composer.voice.transcribing")
|
||||
: t("thread.composer.tools.voice");
|
||||
const voiceButtonTooltip =
|
||||
voiceRecorder.state === "recording"
|
||||
? t("thread.composer.voice.stop")
|
||||
: voiceRecorder.state === "transcribing"
|
||||
? t("thread.composer.voice.transcribing")
|
||||
: t("thread.composer.voice.hint");
|
||||
const showStopButton = isStreaming && !!onStop;
|
||||
const relaxedHeroInput = isHero && images.length === 0 && !isStreaming;
|
||||
const inputTextClasses = cn(
|
||||
@ -1531,7 +1709,15 @@ export function ThreadComposer({
|
||||
>
|
||||
<Plus className={cn(isHero ? "h-[18px] w-[18px]" : "h-4 w-4")} />
|
||||
</Button>
|
||||
{workspaceScope ? (
|
||||
{voiceRecorder.isRecording ? (
|
||||
<VoiceRecordingMeter
|
||||
ariaLabel={voiceRecordingStatusLabel}
|
||||
className="mx-1 flex-1"
|
||||
elapsedLabel={voiceRecorder.elapsedLabel}
|
||||
isHero={isHero}
|
||||
levels={voiceRecorder.levels}
|
||||
/>
|
||||
) : workspaceScope ? (
|
||||
<WorkspaceAccessMenu
|
||||
scope={workspaceScope}
|
||||
disabled={disabled || workspaceScopeDisabled}
|
||||
@ -1542,7 +1728,7 @@ export function ThreadComposer({
|
||||
) : null}
|
||||
</div>
|
||||
<div className={cn("flex shrink-0 items-center", isHero ? "gap-1.5" : "gap-2")}>
|
||||
{modelLabel ? (
|
||||
{modelLabel && !voiceRecorder.isRecording ? (
|
||||
<ComposerModelBadge
|
||||
label={modelLabel}
|
||||
provider={modelProvider}
|
||||
@ -1552,6 +1738,53 @@ export function ThreadComposer({
|
||||
onClick={modelNeedsSetup ? onModelBadgeClick : undefined}
|
||||
/>
|
||||
) : null}
|
||||
{showVoiceButton ? (
|
||||
<TooltipProvider delayDuration={220} skipDelayDuration={80}>
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<Button
|
||||
type="button"
|
||||
size="icon"
|
||||
variant="ghost"
|
||||
disabled={voiceRecorder.buttonDisabled}
|
||||
aria-label={voiceButtonLabel}
|
||||
aria-keyshortcuts={VOICE_SHORTCUT_ARIA}
|
||||
title={voiceButtonTooltip}
|
||||
onPointerDown={voiceRecorder.beginPress}
|
||||
onPointerUp={voiceRecorder.endPress}
|
||||
onPointerCancel={voiceRecorder.endPress}
|
||||
onClick={voiceRecorder.handleClick}
|
||||
className={cn(
|
||||
"rounded-full border border-transparent text-muted-foreground hover:bg-muted/65 hover:text-foreground",
|
||||
isHero ? "h-8 w-8" : "h-9 w-9",
|
||||
voiceRecorder.isRecording &&
|
||||
"bg-red-500 text-white shadow-[0_8px_20px_rgba(239,68,68,0.22)] hover:bg-red-500 hover:text-white",
|
||||
)}
|
||||
>
|
||||
{voiceRecorder.state === "transcribing" ? (
|
||||
<Loader2 className={cn(isHero ? "h-4 w-4" : "h-4 w-4", "animate-spin")} />
|
||||
) : voiceRecorder.isRecording ? (
|
||||
<Square className={cn(isHero ? "h-3.5 w-3.5" : "h-3.5 w-3.5")} fill="currentColor" />
|
||||
) : (
|
||||
<Mic className={cn(isHero ? "h-4 w-4" : "h-4 w-4")} />
|
||||
)}
|
||||
</Button>
|
||||
</TooltipTrigger>
|
||||
<TooltipContent
|
||||
side="top"
|
||||
align="center"
|
||||
className="flex items-center gap-2 rounded-full border border-border/70 bg-background px-3 py-1.5 text-[13px] font-medium text-foreground shadow-[0_8px_24px_rgba(15,23,42,0.13)] dark:border-white/10 dark:bg-neutral-900 dark:text-white"
|
||||
>
|
||||
<span>{voiceButtonTooltip}</span>
|
||||
{voiceRecorder.state === "idle" ? (
|
||||
<kbd className="rounded-full bg-muted px-2 py-0.5 font-sans text-[12px] font-semibold leading-none text-muted-foreground dark:bg-white/10 dark:text-white/80">
|
||||
{voiceShortcutLabel}
|
||||
</kbd>
|
||||
) : null}
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
</TooltipProvider>
|
||||
) : null}
|
||||
<Button
|
||||
type={showStopButton || modelNeedsSetup ? "button" : "submit"}
|
||||
size="icon"
|
||||
|
||||
@ -302,6 +302,7 @@ export function ThreadShell({
|
||||
runStartedAt,
|
||||
goalState,
|
||||
send,
|
||||
transcribeAudio,
|
||||
stop,
|
||||
setMessages,
|
||||
streamError,
|
||||
@ -642,6 +643,7 @@ export function ThreadShell({
|
||||
cliApps={cliApps}
|
||||
mcpPresets={mcpPresets}
|
||||
onStop={stop}
|
||||
onTranscribeAudio={transcribeAudio}
|
||||
runStartedAt={runStartedAt}
|
||||
goalState={goalState}
|
||||
workspaceScope={workspaceScope}
|
||||
@ -672,6 +674,7 @@ export function ThreadShell({
|
||||
cliApps={cliApps}
|
||||
mcpPresets={mcpPresets}
|
||||
runStartedAt={runStartedAt}
|
||||
onTranscribeAudio={transcribeAudio}
|
||||
goalState={goalState}
|
||||
workspaceScope={workspaceScope}
|
||||
workspaceDefaultScope={workspaceDefaultScope}
|
||||
|
||||
@ -438,6 +438,7 @@ export function useNanobotStream(
|
||||
/** Latest sustained goal for this ``chatId`` (``goal_state`` WS events). */
|
||||
goalState: GoalStateWsPayload | undefined;
|
||||
send: (content: string, images?: SendImage[], options?: SendOptions) => void;
|
||||
transcribeAudio: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
|
||||
stop: () => void;
|
||||
setMessages: React.Dispatch<React.SetStateAction<UIMessage[]>>;
|
||||
/** Latest transport-level fault raised since the last ``dismissStreamError``.
|
||||
@ -1089,12 +1090,19 @@ export function useNanobotStream(
|
||||
client.sendMessage(chatId, "/stop");
|
||||
}, [chatId, clearActivitySegment, client, flushPendingStreamEvents]);
|
||||
|
||||
const transcribeAudio = useCallback(
|
||||
(dataUrl: string, options?: { durationMs?: number }) =>
|
||||
client.transcribeAudio(dataUrl, options),
|
||||
[client],
|
||||
);
|
||||
|
||||
return {
|
||||
messages,
|
||||
isStreaming,
|
||||
runStartedAt,
|
||||
goalState,
|
||||
send,
|
||||
transcribeAudio,
|
||||
stop,
|
||||
setMessages,
|
||||
streamError,
|
||||
|
||||
422
webui/src/hooks/useVoiceRecorder.ts
Normal file
422
webui/src/hooks/useVoiceRecorder.ts
Normal file
@ -0,0 +1,422 @@
|
||||
import {
|
||||
useCallback,
|
||||
useEffect,
|
||||
useRef,
|
||||
useState,
|
||||
type PointerEvent as ReactPointerEvent,
|
||||
} from "react";
|
||||
|
||||
/** Recording is stopped automatically after this many milliseconds. */
const VOICE_RECORDING_MAX_MS = 120_000;
/** Recordings shorter than this are rejected with the "tooShort" error. */
const VOICE_RECORDING_MIN_MS = 650;
/** Delay before the "noInput" hint is raised if no audible level was measured. */
const VOICE_NO_INPUT_HINT_MS = 1_100;
/** How long a pointer press must last before it counts as hold-to-record. */
const VOICE_HOLD_START_MS = 140;
/** Number of bars kept in the rolling waveform. */
const VOICE_WAVEFORM_BAR_COUNT = 64;
/** Bar height used for levels below `VOICE_MIN_LEVEL`. */
const VOICE_WAVEFORM_SILENT_HEIGHT = 3;
/** Bar height for the quietest level that still counts as input. */
const VOICE_WAVEFORM_MIN_HEIGHT = 7;
/** Bar height for a full-scale level. */
const VOICE_WAVEFORM_MAX_HEIGHT = 34;
/** Normalized level threshold below which input is treated as silence. */
const VOICE_MIN_LEVEL = 0.018;
/** Waveform shown before any level has been measured: every bar at silent height. */
const VOICE_WAVEFORM_IDLE_LEVELS = new Array<number>(VOICE_WAVEFORM_BAR_COUNT).fill(
  VOICE_WAVEFORM_SILENT_HEIGHT,
);
/** Recorder MIME types in order of preference; the first supported one is used. */
const VOICE_MIME_CANDIDATES = [
  "audio/webm;codecs=opus",
  "audio/webm",
  "audio/mp4",
  "audio/ogg;codecs=opus",
] as const;
|
||||
|
||||
/** Lifecycle phase of the recorder: waiting, capturing audio, or awaiting a transcript. */
export type VoiceRecorderState = "idle" | "recording" | "transcribing";
/**
 * Identifier for a recorder failure, passed to the `onError` callback.
 * The consumer is expected to map it to a user-facing message.
 */
export type VoiceRecorderErrorKey =
  | "failed"
  | "noInput"
  | "notConfigured"
  | "permission"
  | "tooLong"
  | "tooShort"
  | "unsupported";
|
||||
|
||||
/** Options accepted by `useVoiceRecorder`. */
interface VoiceRecorderOptions {
  /** When true, pointer holds and shortcut holds do not start a recording and the button is reported as disabled. */
  disabled?: boolean;
  /** Called when a recording starts, and when input is detected after a "noInput" hint was shown. */
  onClearError: () => void;
  /** Called with the key describing why recording or transcription could not complete. */
  onError: (key: VoiceRecorderErrorKey) => void;
  /** Called with the text that `onTranscribeAudio` resolved to. */
  onTranscript: (text: string) => void;
  /**
   * Transcribes a recording supplied as a data URL; `durationMs` is the
   * measured recording length. Recording cannot start when this is omitted.
   */
  onTranscribeAudio?: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
}
|
||||
|
||||
/**
 * React hook that records microphone audio, shows a live waveform, and hands
 * the finished recording to `onTranscribeAudio` for transcription.
 *
 * Three interaction modes are supported:
 * - click to toggle recording (`handleClick`),
 * - press-and-hold on the button (`beginPress` / `endPress`),
 * - keyboard shortcut hold (`beginShortcutHold` / `endShortcutHold`).
 *
 * Failures are reported through `onError` with a `VoiceRecorderErrorKey`;
 * successful transcripts are delivered through `onTranscript`.
 *
 * @returns Event handlers plus the derived UI state: `state`, `isRecording`,
 *   `buttonDisabled`, the formatted `elapsedLabel`, and the waveform `levels`.
 */
export function useVoiceRecorder({
  disabled,
  onClearError,
  onError,
  onTranscript,
  onTranscribeAudio,
}: VoiceRecorderOptions) {
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<BlobPart[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  const audioRef = useRef<VoiceAudioState | null>(null);
  const startedAtRef = useRef(0);
  const maxTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const inputHintTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const holdTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const holdActiveRef = useRef(false);
  // True from the moment a start is requested until cleanupRecording runs.
  const startPendingRef = useRef(false);
  // Set when a hold is released before the recorder finished starting.
  const stopAfterStartRef = useRef(false);
  const suppressClickRef = useRef(false);
  const suppressClickTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const shortcutActiveRef = useRef(false);
  const levelObservedRef = useRef(false);
  const peakLevelRef = useRef(0);
  const levelReliableRef = useRef(false);
  const noInputHintVisibleRef = useRef(false);
  const [state, setState] = useState<VoiceRecorderState>("idle");
  const [elapsedMs, setElapsedMs] = useState(0);
  const [levels, setLevels] = useState<number[]>(VOICE_WAVEFORM_IDLE_LEVELS);

  const clearInputHintTimer = useCallback(() => clearTimer(inputHintTimerRef), []);
  const clearSuppressClickTimer = useCallback(() => clearTimer(suppressClickTimerRef), []);

  // Swallow the click that follows a hold gesture; the flag expires on its own.
  const suppressNextClick = useCallback(() => {
    clearSuppressClickTimer();
    suppressClickRef.current = true;
    suppressClickTimerRef.current = setTimeout(() => {
      suppressClickRef.current = false;
      suppressClickTimerRef.current = null;
    }, 500);
  }, [clearSuppressClickTimer]);

  // Tear down the analyser graph and cancel the pending animation frame.
  const stopWaveform = useCallback(() => {
    const audio = audioRef.current;
    audioRef.current = null;
    if (!audio) return;
    if (audio.frame !== null) cancelAnimationFrame(audio.frame);
    audio.source.disconnect();
    audio.analyser.disconnect();
    void audio.context.close().catch(() => undefined);
  }, []);

  // Attach an analyser to the stream and sample its level once per animation frame.
  const startWaveform = useCallback((stream: MediaStream) => {
    const AudioContextCtor = audioContextConstructor();
    if (!AudioContextCtor) return;
    stopWaveform();
    setLevels(VOICE_WAVEFORM_IDLE_LEVELS);
    try {
      const context = new AudioContextCtor();
      const source = context.createMediaStreamSource(stream);
      const analyser = context.createAnalyser();
      analyser.fftSize = 256;
      analyser.smoothingTimeConstant = 0.68;
      source.connect(analyser);
      const audio: VoiceAudioState = {
        analyser,
        context,
        data: new Uint8Array(analyser.fftSize),
        frame: null,
        source,
      };
      const tick = () => {
        const current = audioRef.current;
        if (!current) return;
        if (current.context.state !== "running") {
          // Keep polling until the context is running; levels read before
          // that are not recorded as observations.
          void current.context.resume().catch(() => undefined);
          current.frame = requestAnimationFrame(tick);
          return;
        }
        current.analyser.getByteTimeDomainData(current.data);
        const level = voiceLevelFromSamples(current.data);
        levelReliableRef.current = true;
        levelObservedRef.current = true;
        peakLevelRef.current = Math.max(peakLevelRef.current, level);
        if (level >= VOICE_MIN_LEVEL) {
          clearInputHintTimer();
          if (noInputHintVisibleRef.current) {
            noInputHintVisibleRef.current = false;
            onClearError();
          }
        }
        // Rolling window: drop the oldest bar, append the newest.
        setLevels((currentLevels) => [
          ...currentLevels.slice(1),
          waveformHeightFromLevel(level),
        ]);
        current.frame = requestAnimationFrame(tick);
      };
      audioRef.current = audio;
      void context.resume().catch(() => undefined);
      audio.frame = requestAnimationFrame(tick);
    } catch {
      stopWaveform();
    }
  }, [clearInputHintTimer, onClearError, stopWaveform]);

  // Release every resource tied to the current recording attempt.
  const cleanupRecording = useCallback(() => {
    clearTimer(holdTimerRef);
    clearInputHintTimer();
    clearTimer(maxTimerRef);
    stopWaveform();
    streamRef.current?.getTracks().forEach((track) => track.stop());
    streamRef.current = null;
    mediaRecorderRef.current = null;
    startPendingRef.current = false;
    shortcutActiveRef.current = false;
    noInputHintVisibleRef.current = false;
  }, [clearInputHintTimer, stopWaveform]);

  const stopRecording = useCallback(() => {
    const recorder = mediaRecorderRef.current;
    if (!recorder || recorder.state === "inactive") return;
    recorder.stop();
  }, []);

  // Stop now if the recorder is live; otherwise remember to stop once it starts.
  const stopRecordingWhenReady = useCallback(() => {
    const recorder = mediaRecorderRef.current;
    if (recorder && recorder.state !== "inactive") {
      stopRecording();
    } else if (startPendingRef.current) {
      stopAfterStartRef.current = true;
    }
  }, [stopRecording]);

  const startRecording = useCallback(async () => {
    if (!onTranscribeAudio || state !== "idle" || startPendingRef.current) return;
    if (!navigator.mediaDevices?.getUserMedia || typeof MediaRecorder === "undefined") {
      onError("unsupported");
      return;
    }
    startPendingRef.current = true;
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      if (!startPendingRef.current) {
        // cleanupRecording ran while the permission prompt was open (the
        // hook was torn down); release the microphone instead of recording.
        stream.getTracks().forEach((track) => track.stop());
        return;
      }
      // Track the stream before anything else can throw, so the catch
      // block's cleanupRecording() is able to stop its tracks.
      streamRef.current = stream;
      const recorder = new MediaRecorder(stream, mediaRecorderOptions());
      chunksRef.current = [];
      mediaRecorderRef.current = recorder;
      startedAtRef.current = Date.now();
      levelObservedRef.current = false;
      peakLevelRef.current = 0;
      levelReliableRef.current = false;
      noInputHintVisibleRef.current = false;
      setElapsedMs(0);
      startWaveform(stream);
      recorder.ondataavailable = (event) => {
        if (event.data.size > 0) chunksRef.current.push(event.data);
      };
      recorder.onstop = () => {
        // Snapshot everything needed before cleanupRecording resets the refs.
        const chunks = chunksRef.current.splice(0);
        const durationMs = Math.max(0, Date.now() - startedAtRef.current);
        const mimeType = recorder.mimeType || "audio/webm";
        const hasMeasuredSilence =
          levelReliableRef.current
          && levelObservedRef.current
          && peakLevelRef.current < VOICE_MIN_LEVEL;
        cleanupRecording();
        if (chunks.length === 0) {
          setState("idle");
          return;
        }
        if (durationMs < VOICE_RECORDING_MIN_MS) {
          setState("idle");
          onError("tooShort");
          return;
        }
        if (hasMeasuredSilence) {
          setState("idle");
          onError("noInput");
          return;
        }
        setState("transcribing");
        void blobToDataUrl(new Blob(chunks, { type: mimeType }))
          .then((dataUrl) => onTranscribeAudio(dataUrl, { durationMs }))
          .then(onTranscript)
          .catch((error) => onError(transcriptionErrorKey(error)))
          .finally(() => setState("idle"));
      };
      recorder.start();
      setState("recording");
      onClearError();
      maxTimerRef.current = setTimeout(stopRecording, VOICE_RECORDING_MAX_MS);
      inputHintTimerRef.current = setTimeout(() => {
        const recording = mediaRecorderRef.current?.state === "recording";
        if (
          !recording
          || !levelReliableRef.current
          || !levelObservedRef.current
          || peakLevelRef.current >= VOICE_MIN_LEVEL
        ) {
          return;
        }
        noInputHintVisibleRef.current = true;
        onError("noInput");
      }, VOICE_NO_INPUT_HINT_MS);
    } catch {
      cleanupRecording();
      setState("idle");
      onError("permission");
    }
  }, [
    cleanupRecording,
    onClearError,
    onError,
    onTranscribeAudio,
    onTranscript,
    startWaveform,
    state,
    stopRecording,
  ]);

  // Start recording, then honor a stop that was requested while starting.
  const startRecordingWithDeferredStop = useCallback(() => {
    stopAfterStartRef.current = false;
    void startRecording().then(() => {
      if (!stopAfterStartRef.current) return;
      stopAfterStartRef.current = false;
      stopRecording();
    });
  }, [startRecording, stopRecording]);

  const beginPress = useCallback((event: ReactPointerEvent<HTMLButtonElement>) => {
    if (event.pointerType === "mouse" && event.button !== 0) return;
    if (!onTranscribeAudio || disabled || state !== "idle") return;
    clearTimer(holdTimerRef);
    try {
      event.currentTarget.setPointerCapture(event.pointerId);
    } catch {
      // Some embedded runtimes do not expose pointer capture for toolbar buttons.
    }
    // Only a press that outlasts the hold delay becomes hold-to-record;
    // shorter presses fall through to the click handler.
    holdTimerRef.current = setTimeout(() => {
      holdTimerRef.current = null;
      holdActiveRef.current = true;
      suppressNextClick();
      startRecordingWithDeferredStop();
    }, VOICE_HOLD_START_MS);
  }, [disabled, onTranscribeAudio, startRecordingWithDeferredStop, state, suppressNextClick]);

  const endPress = useCallback(() => {
    const wasHoldRecording = holdActiveRef.current;
    clearTimer(holdTimerRef);
    if (!wasHoldRecording) return;
    holdActiveRef.current = false;
    suppressNextClick();
    stopRecordingWhenReady();
  }, [stopRecordingWhenReady, suppressNextClick]);

  const handleClick = useCallback(() => {
    if (suppressClickRef.current) {
      clearSuppressClickTimer();
      suppressClickRef.current = false;
      return;
    }
    if (state === "recording") stopRecording();
    else void startRecording();
  }, [clearSuppressClickTimer, startRecording, state, stopRecording]);

  const beginShortcutHold = useCallback(() => {
    if (!onTranscribeAudio || disabled || state !== "idle" || shortcutActiveRef.current) return;
    shortcutActiveRef.current = true;
    startRecordingWithDeferredStop();
  }, [disabled, onTranscribeAudio, startRecordingWithDeferredStop, state]);

  const endShortcutHold = useCallback(() => {
    if (!shortcutActiveRef.current) return;
    shortcutActiveRef.current = false;
    stopRecordingWhenReady();
  }, [stopRecordingWhenReady]);

  // Drive the elapsed-time display while recording; reset it otherwise.
  useEffect(() => {
    if (state !== "recording") {
      setElapsedMs(0);
      return;
    }
    const updateElapsed = () => {
      setElapsedMs(Math.max(0, Date.now() - startedAtRef.current));
    };
    updateElapsed();
    const interval = window.setInterval(updateElapsed, 250);
    return () => window.clearInterval(interval);
  }, [state]);

  // Release the microphone, timers and audio graph when the hook unmounts.
  useEffect(() => cleanupRecording, [cleanupRecording]);
  useEffect(() => () => clearSuppressClickTimer(), [clearSuppressClickTimer]);

  return {
    beginShortcutHold,
    beginPress,
    buttonDisabled: disabled || state === "transcribing",
    elapsedLabel: formatVoiceElapsed(elapsedMs),
    endShortcutHold,
    endPress,
    handleClick,
    isRecording: state === "recording",
    levels,
    state,
  };
}
|
||||
|
||||
/** Web Audio objects backing the live waveform for one recording. */
interface VoiceAudioState {
  /** Analyser node the time-domain samples are read from. */
  analyser: AnalyserNode;
  /** Audio context that owns the source and analyser nodes. */
  context: AudioContext;
  /** Reusable sample buffer, allocated with `analyser.fftSize` entries. */
  data: Uint8Array<ArrayBuffer>;
  /** Handle of the pending animation frame, or null when none is scheduled. */
  frame: number | null;
  /** Source node created from the microphone stream. */
  source: MediaStreamAudioSourceNode;
}
|
||||
|
||||
/**
 * Cancels the timeout held in `ref` (if any) and resets the ref to null.
 * Does nothing when the ref is already null.
 */
function clearTimer(ref: { current: ReturnType<typeof setTimeout> | null }) {
  const handle = ref.current;
  if (handle === null) return;
  clearTimeout(handle);
  ref.current = null;
}
|
||||
|
||||
/**
 * Picks recorder options for the current runtime.
 *
 * @returns `{ mimeType }` for the first candidate type the runtime reports as
 *   supported, or `undefined` when `MediaRecorder` is unavailable or none of
 *   the candidates is supported.
 */
function mediaRecorderOptions(): MediaRecorderOptions | undefined {
  if (typeof MediaRecorder === "undefined") return undefined;
  for (const candidate of VOICE_MIME_CANDIDATES) {
    if (MediaRecorder.isTypeSupported(candidate)) return { mimeType: candidate };
  }
  return undefined;
}
|
||||
|
||||
/**
 * Formats a duration in milliseconds as `m:ss`.
 * Fractions of a second are dropped and negative durations are shown as `0:00`.
 */
function formatVoiceElapsed(ms: number): string {
  const totalSeconds = Math.max(0, Math.floor(ms / 1000));
  const secondsPart = String(totalSeconds % 60).padStart(2, "0");
  const minutesPart = Math.floor(totalSeconds / 60);
  return `${minutesPart}:${secondsPart}`;
}
|
||||
|
||||
/**
 * Resolves the `AudioContext` constructor for this runtime, falling back to
 * the `webkitAudioContext` property when the standard one is missing.
 *
 * @returns The constructor, or `undefined` when there is no `window` or
 *   neither constructor exists.
 */
function audioContextConstructor(): typeof AudioContext | undefined {
  if (typeof window === "undefined") return undefined;
  const prefixed = window as unknown as { webkitAudioContext?: typeof AudioContext };
  return window.AudioContext ?? prefixed.webkitAudioContext;
}
|
||||
|
||||
/**
 * Converts unsigned 8-bit time-domain samples (centered on 128) into a
 * normalized loudness level in the range 0 to 1.
 *
 * The level is the RMS of the centered samples, boosted and compressed by a
 * fixed gain/exponent curve and capped at 1. Empty input yields 0.
 */
function voiceLevelFromSamples(samples: ArrayLike<number>): number {
  const count = samples.length;
  if (count === 0) return 0;

  // Accumulate in forward order so floating-point results stay stable.
  let sumOfSquares = 0;
  let position = 0;
  while (position < count) {
    const deviation = (samples[position] - 128) / 128;
    sumOfSquares += deviation * deviation;
    position += 1;
  }

  const rms = Math.sqrt(sumOfSquares / count);
  const shaped = Math.pow(rms * 4.2, 0.72);
  return Math.min(1, shaped);
}
|
||||
|
||||
/**
 * Maps a normalized loudness level to a waveform bar height.
 *
 * Levels below the silence threshold get the fixed silent height; all other
 * levels are scaled linearly between the minimum and maximum active heights
 * and rounded to the nearest integer.
 */
function waveformHeightFromLevel(level: number): number {
  if (level < VOICE_MIN_LEVEL) return VOICE_WAVEFORM_SILENT_HEIGHT;

  const heightRange = VOICE_WAVEFORM_MAX_HEIGHT - VOICE_WAVEFORM_MIN_HEIGHT;
  // Re-base the level so the threshold maps to 0 and full scale maps to 1.
  const rebased = (level - VOICE_MIN_LEVEL) / (1 - VOICE_MIN_LEVEL);
  const activeLevel = Math.min(1, rebased);
  return Math.round(VOICE_WAVEFORM_MIN_HEIGHT + activeLevel * heightRange);
}
|
||||
|
||||
/**
 * Reads a blob into a `data:` URL using `FileReader`.
 *
 * @returns A promise resolving to the data URL string. It rejects with the
 *   reader's error (or `read_failed`) when reading fails, and with
 *   `invalid_data_url` when the reader produces a non-string result.
 */
function blobToDataUrl(blob: Blob): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const reader = new FileReader();
    reader.onerror = () => reject(reader.error ?? new Error("read_failed"));
    reader.onload = () => {
      const { result } = reader;
      if (typeof result !== "string") {
        reject(new Error("invalid_data_url"));
        return;
      }
      resolve(result);
    };
    reader.readAsDataURL(blob);
  });
}
|
||||
|
||||
/**
 * Translates a transcription failure into a recorder error key, based on the
 * message of the thrown `Error`. Anything unrecognized, including values
 * that are not `Error` instances, maps to "failed".
 */
function transcriptionErrorKey(error: unknown): VoiceRecorderErrorKey {
  if (!(error instanceof Error)) return "failed";
  switch (error.message) {
    case "not_configured":
      return "notConfigured";
    case "duration":
      return "tooLong";
    default:
      return "failed";
  }
}
|
||||
@ -73,6 +73,7 @@
|
||||
"models": "Models",
|
||||
"providers": "Providers",
|
||||
"image": "Image",
|
||||
"voice": "Voice",
|
||||
"browser": "Web",
|
||||
"cliApps": "CLI Apps",
|
||||
"mcp": "MCP",
|
||||
@ -99,7 +100,8 @@
|
||||
"capabilities": "Capabilities",
|
||||
"apps": "Apps",
|
||||
"nativeHost": "Native host",
|
||||
"hostSafety": "App safety"
|
||||
"hostSafety": "App safety",
|
||||
"voiceInput": "Voice input"
|
||||
},
|
||||
"models": {
|
||||
"selectModel": "Select model",
|
||||
@ -161,7 +163,13 @@
|
||||
"engine": "Engine",
|
||||
"logs": "Logs",
|
||||
"diagnostics": "Diagnostics",
|
||||
"contextWindow": "Context window"
|
||||
"contextWindow": "Context window",
|
||||
"transcription": "Transcription",
|
||||
"transcriptionProvider": "Provider",
|
||||
"transcriptionProviderStatus": "Provider status",
|
||||
"transcriptionModel": "Model",
|
||||
"transcriptionLanguage": "Language",
|
||||
"voiceLimits": "Limits"
|
||||
},
|
||||
"help": {
|
||||
"theme": "Switch between light and dark appearance.",
|
||||
@ -200,7 +208,12 @@
|
||||
"diagnostics": "Export a small runtime report for support.",
|
||||
"localServiceAccessNative": "Allow Full Access shell commands to reach services on this Mac.",
|
||||
"webuiDefaultAccessNative": "Used by native chats without a project-specific permission.",
|
||||
"contextWindow": "Choose the default context budget for this model configuration."
|
||||
"contextWindow": "Choose the default context budget for this model configuration.",
|
||||
"transcription": "Transcribe microphone input before sending it. Chat channel voice messages use the same settings.",
|
||||
"transcriptionProvider": "Uses the matching provider credentials from Providers.",
|
||||
"transcriptionProviderStatus": "API keys stay under providers, not in transcription settings.",
|
||||
"transcriptionModel": "Leave as the resolved default unless your provider needs a custom model id.",
|
||||
"transcriptionLanguage": "Optional ISO-639 hint such as en, zh, ja, or ko."
|
||||
},
|
||||
"timezone": {
|
||||
"select": "Select timezone",
|
||||
@ -391,6 +404,7 @@
|
||||
"totalProviders": "{{count}} available",
|
||||
"webSearch": "Web search",
|
||||
"imageGeneration": "Image generation",
|
||||
"voiceInput": "Voice input",
|
||||
"workspace": "Workspace"
|
||||
},
|
||||
"usage": {
|
||||
@ -486,6 +500,11 @@
|
||||
"rawInstructions": "Raw SKILL.md",
|
||||
"rawInstructionsEmpty": "No raw instructions.",
|
||||
"detailDescription": "Details for {{name}}."
|
||||
},
|
||||
"voice": {
|
||||
"selectProvider": "Select provider",
|
||||
"configureProvider": "Configure provider",
|
||||
"languageAuto": "Auto"
|
||||
}
|
||||
},
|
||||
"chat": {
|
||||
@ -678,6 +697,21 @@
|
||||
"deepResearch": "Deep research",
|
||||
"voice": "Voice input"
|
||||
},
|
||||
"voice": {
|
||||
"hint": "Click to dictate or hold",
|
||||
"stop": "Stop recording",
|
||||
"transcribing": "Transcribing...",
|
||||
"recordingStatus": "Recording {{time}}"
|
||||
},
|
||||
"voiceErrors": {
|
||||
"unsupported": "Voice input is not supported in this browser.",
|
||||
"permission": "Microphone permission is required.",
|
||||
"notConfigured": "Configure a transcription provider first.",
|
||||
"tooLong": "Recording is too long.",
|
||||
"tooShort": "Hold a little longer to record voice.",
|
||||
"noInput": "No microphone input detected.",
|
||||
"failed": "Could not transcribe audio."
|
||||
},
|
||||
"slash": {
|
||||
"ariaLabel": "Slash commands",
|
||||
"label": "commands",
|
||||
|
||||
@ -73,6 +73,7 @@
|
||||
"models": "Modelos",
|
||||
"providers": "Proveedores",
|
||||
"image": "Imagen",
|
||||
"voice": "Voz",
|
||||
"browser": "Internet",
|
||||
"runtime": "Sistema",
|
||||
"advanced": "Seguridad",
|
||||
@ -99,7 +100,8 @@
|
||||
"mcp": "Servicios MCP",
|
||||
"apps": "Aplicaciones",
|
||||
"nativeHost": "Host nativo",
|
||||
"hostSafety": "Seguridad de la app"
|
||||
"hostSafety": "Seguridad de la app",
|
||||
"voiceInput": "Entrada de voz"
|
||||
},
|
||||
"rows": {
|
||||
"theme": "Tema",
|
||||
@ -142,7 +144,13 @@
|
||||
"engine": "Motor",
|
||||
"logs": "Registros",
|
||||
"diagnostics": "Diagnóstico",
|
||||
"contextWindow": "Ventana de contexto"
|
||||
"contextWindow": "Ventana de contexto",
|
||||
"transcription": "Transcripcion",
|
||||
"transcriptionProvider": "Proveedor",
|
||||
"transcriptionProviderStatus": "Estado del proveedor",
|
||||
"transcriptionModel": "Modelo",
|
||||
"transcriptionLanguage": "Idioma",
|
||||
"voiceLimits": "Limites"
|
||||
},
|
||||
"help": {
|
||||
"theme": "Cambia entre apariencia clara y oscura.",
|
||||
@ -181,7 +189,12 @@
|
||||
"diagnostics": "Exporta un pequeño informe de runtime para soporte.",
|
||||
"localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.",
|
||||
"webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.",
|
||||
"contextWindow": "Elige el presupuesto de contexto predeterminado para esta configuración de modelo."
|
||||
"contextWindow": "Elige el presupuesto de contexto predeterminado para esta configuración de modelo.",
|
||||
"transcription": "Transcribe la entrada del microfono antes de enviarla. Los mensajes de voz de los canales usan la misma configuracion.",
|
||||
"transcriptionProvider": "Usa las credenciales del proveedor correspondiente en Proveedores.",
|
||||
"transcriptionProviderStatus": "Las claves API permanecen en proveedores, no en la configuracion de transcripcion.",
|
||||
"transcriptionModel": "Dejalo como el valor predeterminado resuelto salvo que el proveedor necesite un id de modelo personalizado.",
|
||||
"transcriptionLanguage": "Pista ISO-639 opcional, como en, zh, ja o ko."
|
||||
},
|
||||
"values": {
|
||||
"light": "Claro",
|
||||
@ -283,6 +296,7 @@
|
||||
"totalProviders": "{{count}} disponibles",
|
||||
"webSearch": "Búsqueda web",
|
||||
"imageGeneration": "Generación de imágenes",
|
||||
"voiceInput": "Entrada de voz",
|
||||
"workspace": "Espacio de trabajo"
|
||||
},
|
||||
"usage": {
|
||||
@ -486,6 +500,11 @@
|
||||
"rawInstructions": "SKILL.md original",
|
||||
"rawInstructionsEmpty": "No hay instrucciones originales.",
|
||||
"detailDescription": "Detalles de {{name}}."
|
||||
},
|
||||
"voice": {
|
||||
"selectProvider": "Seleccionar proveedor",
|
||||
"configureProvider": "Configurar proveedor",
|
||||
"languageAuto": "Auto"
|
||||
}
|
||||
},
|
||||
"chat": {
|
||||
@ -678,6 +697,21 @@
|
||||
"deepResearch": "Investigación profunda",
|
||||
"voice": "Entrada de voz"
|
||||
},
|
||||
"voice": {
|
||||
"hint": "Haz clic para dictar o mantén",
|
||||
"stop": "Detener grabación",
|
||||
"transcribing": "Transcribiendo...",
|
||||
"recordingStatus": "Grabando {{time}}"
|
||||
},
|
||||
"voiceErrors": {
|
||||
"unsupported": "Este navegador no admite entrada de voz.",
|
||||
"permission": "Se requiere permiso de micrófono.",
|
||||
"notConfigured": "Configura primero un proveedor de transcripción.",
|
||||
"tooLong": "La grabación es demasiado larga.",
|
||||
"tooShort": "Mantén pulsado un poco más para grabar voz.",
|
||||
"noInput": "No se detectó entrada del micrófono.",
|
||||
"failed": "No se pudo transcribir el audio."
|
||||
},
|
||||
"slash": {
|
||||
"ariaLabel": "Comandos slash",
|
||||
"label": "comandos",
|
||||
|
||||
@ -73,6 +73,7 @@
|
||||
"models": "Modèles",
|
||||
"providers": "Fournisseurs",
|
||||
"image": "Images",
|
||||
"voice": "Voix",
|
||||
"browser": "Internet",
|
||||
"runtime": "Système",
|
||||
"advanced": "Sécurité",
|
||||
@ -99,7 +100,8 @@
|
||||
"mcp": "Services MCP",
|
||||
"apps": "Applications",
|
||||
"nativeHost": "Hôte natif",
|
||||
"hostSafety": "Sécurité de l’app"
|
||||
"hostSafety": "Sécurité de l’app",
|
||||
"voiceInput": "Saisie vocale"
|
||||
},
|
||||
"rows": {
|
||||
"theme": "Thème",
|
||||
@ -142,7 +144,13 @@
|
||||
"engine": "Moteur",
|
||||
"logs": "Journaux",
|
||||
"diagnostics": "Diagnostic",
|
||||
"contextWindow": "Fenêtre de contexte"
|
||||
"contextWindow": "Fenêtre de contexte",
|
||||
"transcription": "Transcription",
|
||||
"transcriptionProvider": "Fournisseur",
|
||||
"transcriptionProviderStatus": "Etat du fournisseur",
|
||||
"transcriptionModel": "Modele",
|
||||
"transcriptionLanguage": "Langue",
|
||||
"voiceLimits": "Limites"
|
||||
},
|
||||
"help": {
|
||||
"theme": "Basculer entre l’apparence claire et sombre.",
|
||||
@ -181,7 +189,12 @@
|
||||
"diagnostics": "Exporte un petit rapport d’exécution pour le support.",
|
||||
"localServiceAccessNative": "Autorise les commandes shell Full Access à atteindre les services sur ce Mac.",
|
||||
"webuiDefaultAccessNative": "Utilisé par les chats natifs sans permission propre au projet.",
|
||||
"contextWindow": "Choisissez le budget de contexte par défaut pour cette configuration de modèle."
|
||||
"contextWindow": "Choisissez le budget de contexte par défaut pour cette configuration de modèle.",
|
||||
"transcription": "Transcrit l'entree micro avant l'envoi. Les messages vocaux des canaux utilisent les memes reglages.",
|
||||
"transcriptionProvider": "Utilise les identifiants du fournisseur correspondant dans Fournisseurs.",
|
||||
"transcriptionProviderStatus": "Les cles API restent dans les fournisseurs, pas dans les reglages de transcription.",
|
||||
"transcriptionModel": "Laissez le modele resolu par defaut sauf si votre fournisseur exige un id personnalise.",
|
||||
"transcriptionLanguage": "Indice ISO-639 facultatif, comme en, zh, ja ou ko."
|
||||
},
|
||||
"values": {
|
||||
"light": "Clair",
|
||||
@ -283,6 +296,7 @@
|
||||
"totalProviders": "{{count}} disponibles",
|
||||
"webSearch": "Recherche web",
|
||||
"imageGeneration": "Génération d’images",
|
||||
"voiceInput": "Saisie vocale",
|
||||
"workspace": "Espace de travail"
|
||||
},
|
||||
"usage": {
|
||||
@ -486,6 +500,11 @@
|
||||
"rawInstructions": "SKILL.md brut",
|
||||
"rawInstructionsEmpty": "Aucune instruction brute.",
|
||||
"detailDescription": "Détails de {{name}}."
|
||||
},
|
||||
"voice": {
|
||||
"selectProvider": "Choisir un fournisseur",
|
||||
"configureProvider": "Configurer le fournisseur",
|
||||
"languageAuto": "Auto"
|
||||
}
|
||||
},
|
||||
"chat": {
|
||||
@ -678,6 +697,21 @@
|
||||
"deepResearch": "Recherche approfondie",
|
||||
"voice": "Entrée vocale"
|
||||
},
|
||||
"voice": {
|
||||
"hint": "Cliquez pour dicter ou maintenez",
|
||||
"stop": "Arrêter l'enregistrement",
|
||||
"transcribing": "Transcription...",
|
||||
"recordingStatus": "Enregistrement {{time}}"
|
||||
},
|
||||
"voiceErrors": {
|
||||
"unsupported": "La saisie vocale n'est pas prise en charge par ce navigateur.",
|
||||
"permission": "L'autorisation du microphone est requise.",
|
||||
"notConfigured": "Configurez d'abord un fournisseur de transcription.",
|
||||
"tooLong": "L'enregistrement est trop long.",
|
||||
"tooShort": "Maintenez un peu plus longtemps pour enregistrer la voix.",
|
||||
"noInput": "Aucune entrée microphone détectée.",
|
||||
"failed": "Impossible de transcrire l'audio."
|
||||
},
|
||||
"slash": {
|
||||
"ariaLabel": "Commandes slash",
|
||||
"label": "commandes",
|
||||
|
||||
@ -73,6 +73,7 @@
|
||||
"models": "Model",
|
||||
"providers": "Penyedia",
|
||||
"image": "Gambar",
|
||||
"voice": "Suara",
|
||||
"browser": "Internet",
|
||||
"runtime": "Sistem",
|
||||
"advanced": "Keamanan",
|
||||
@ -99,7 +100,8 @@
|
||||
"mcp": "Layanan MCP",
|
||||
"apps": "Aplikasi",
|
||||
"nativeHost": "Host native",
|
||||
"hostSafety": "Keamanan aplikasi"
|
||||
"hostSafety": "Keamanan aplikasi",
|
||||
"voiceInput": "Input suara"
|
||||
},
|
||||
"rows": {
|
||||
"theme": "Tema",
|
||||
@ -142,7 +144,13 @@
|
||||
"engine": "Mesin",
|
||||
"logs": "Log",
|
||||
"diagnostics": "Diagnostik",
|
||||
"contextWindow": "Jendela konteks"
|
||||
"contextWindow": "Jendela konteks",
|
||||
"transcription": "Transkripsi",
|
||||
"transcriptionProvider": "Penyedia",
|
||||
"transcriptionProviderStatus": "Status penyedia",
|
||||
"transcriptionModel": "Model",
|
||||
"transcriptionLanguage": "Bahasa",
|
||||
"voiceLimits": "Batas"
|
||||
},
|
||||
"help": {
|
||||
"theme": "Beralih antara tampilan terang dan gelap.",
|
||||
@ -181,7 +189,12 @@
|
||||
"diagnostics": "Exporta un pequeño informe de runtime para soporte.",
|
||||
"localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.",
|
||||
"webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.",
|
||||
"contextWindow": "Pilih anggaran konteks default untuk konfigurasi model ini."
|
||||
"contextWindow": "Pilih anggaran konteks default untuk konfigurasi model ini.",
|
||||
"transcription": "Transkripsikan input mikrofon sebelum dikirim. Pesan suara channel memakai pengaturan yang sama.",
|
||||
"transcriptionProvider": "Menggunakan kredensial penyedia yang sesuai dari Providers.",
|
||||
"transcriptionProviderStatus": "API key tetap berada di providers, bukan di pengaturan transkripsi.",
|
||||
"transcriptionModel": "Biarkan memakai default yang teresolusi kecuali penyedia membutuhkan id model khusus.",
|
||||
"transcriptionLanguage": "Petunjuk ISO-639 opsional, seperti en, zh, ja, atau ko."
|
||||
},
|
||||
"values": {
|
||||
"light": "Terang",
|
||||
@ -283,6 +296,7 @@
|
||||
"totalProviders": "{{count}} tersedia",
|
||||
"webSearch": "Pencarian web",
|
||||
"imageGeneration": "Pembuatan gambar",
|
||||
"voiceInput": "Input suara",
|
||||
"workspace": "Ruang kerja"
|
||||
},
|
||||
"usage": {
|
||||
@ -486,6 +500,11 @@
|
||||
"rawInstructions": "SKILL.md mentah",
|
||||
"rawInstructionsEmpty": "Tidak ada instruksi mentah.",
|
||||
"detailDescription": "Detail untuk {{name}}."
|
||||
},
|
||||
"voice": {
|
||||
"selectProvider": "Pilih penyedia",
|
||||
"configureProvider": "Konfigurasi penyedia",
|
||||
"languageAuto": "Auto"
|
||||
}
|
||||
},
|
||||
"chat": {
|
||||
@ -678,6 +697,21 @@
|
||||
"deepResearch": "Riset mendalam",
|
||||
"voice": "Input suara"
|
||||
},
|
||||
"voice": {
|
||||
"hint": "Klik untuk mendikte atau tahan",
|
||||
"stop": "Hentikan rekaman",
|
||||
"transcribing": "Mentranskripsi...",
|
||||
"recordingStatus": "Merekam {{time}}"
|
||||
},
|
||||
"voiceErrors": {
|
||||
"unsupported": "Input suara tidak didukung di browser ini.",
|
||||
"permission": "Izin mikrofon diperlukan.",
|
||||
"notConfigured": "Konfigurasikan penyedia transkripsi terlebih dahulu.",
|
||||
"tooLong": "Rekaman terlalu panjang.",
|
||||
"tooShort": "Tahan sedikit lebih lama untuk merekam suara.",
|
||||
"noInput": "Tidak ada input mikrofon yang terdeteksi.",
|
||||
"failed": "Tidak dapat mentranskripsi audio."
|
||||
},
|
||||
"slash": {
|
||||
"ariaLabel": "Perintah slash",
|
||||
"label": "perintah",
|
||||
|
||||
@ -73,6 +73,7 @@
|
||||
"models": "モデル",
|
||||
"providers": "プロバイダー",
|
||||
"image": "画像",
|
||||
"voice": "音声",
|
||||
"browser": "ウェブ",
|
||||
"runtime": "システム",
|
||||
"advanced": "セキュリティ",
|
||||
@ -99,7 +100,8 @@
|
||||
"mcp": "MCP サービス",
|
||||
"apps": "アプリ",
|
||||
"nativeHost": "ネイティブホスト",
|
||||
"hostSafety": "アプリの安全性"
|
||||
"hostSafety": "アプリの安全性",
|
||||
"voiceInput": "音声入力"
|
||||
},
|
||||
"rows": {
|
||||
"theme": "テーマ",
|
||||
@ -142,7 +144,13 @@
|
||||
"engine": "エンジン",
|
||||
"logs": "ログ",
|
||||
"diagnostics": "診断",
|
||||
"contextWindow": "コンテキストウィンドウ"
|
||||
"contextWindow": "コンテキストウィンドウ",
|
||||
"transcription": "文字起こし",
|
||||
"transcriptionProvider": "プロバイダー",
|
||||
"transcriptionProviderStatus": "プロバイダー状態",
|
||||
"transcriptionModel": "モデル",
|
||||
"transcriptionLanguage": "言語",
|
||||
"voiceLimits": "制限"
|
||||
},
|
||||
"help": {
|
||||
"theme": "ライト表示とダーク表示を切り替えます。",
|
||||
@ -181,7 +189,12 @@
|
||||
"diagnostics": "サポート用の小さなランタイムレポートを書き出します。",
|
||||
"localServiceAccessNative": "Full Access の shell コマンドがこの Mac 上のサービスにアクセスできるようにします。",
|
||||
"webuiDefaultAccessNative": "プロジェクト固有の権限がないネイティブチャットで使用します。",
|
||||
"contextWindow": "このモデル設定で使う既定のコンテキスト予算を選択します。"
|
||||
"contextWindow": "このモデル設定で使う既定のコンテキスト予算を選択します。",
|
||||
"transcription": "マイク入力を送信前に文字起こしします。チャネルの音声メッセージも同じ設定を使います。",
|
||||
"transcriptionProvider": "プロバイダー設定にある対応する認証情報を使います。",
|
||||
"transcriptionProviderStatus": "APIキーは文字起こし設定ではなくプロバイダー側に保存されます。",
|
||||
"transcriptionModel": "プロバイダーがカスタムモデルIDを必要としない限り、解決済みのデフォルトのままにします。",
|
||||
"transcriptionLanguage": "en、zh、ja、ko などの任意の ISO-639 ヒント。"
|
||||
},
|
||||
"values": {
|
||||
"light": "ライト",
|
||||
@ -283,6 +296,7 @@
|
||||
"totalProviders": "{{count}} 個利用可能",
|
||||
"webSearch": "Web 検索",
|
||||
"imageGeneration": "画像生成",
|
||||
"voiceInput": "音声入力",
|
||||
"workspace": "ワークスペース"
|
||||
},
|
||||
"usage": {
|
||||
@ -486,6 +500,11 @@
|
||||
"rawInstructions": "元の SKILL.md",
|
||||
"rawInstructionsEmpty": "元の説明はありません。",
|
||||
"detailDescription": "{{name}} の詳細。"
|
||||
},
|
||||
"voice": {
|
||||
"selectProvider": "プロバイダーを選択",
|
||||
"configureProvider": "プロバイダーを設定",
|
||||
"languageAuto": "自動"
|
||||
}
|
||||
},
|
||||
"chat": {
|
||||
@ -678,6 +697,21 @@
|
||||
"deepResearch": "詳細調査",
|
||||
"voice": "音声入力"
|
||||
},
|
||||
"voice": {
|
||||
"hint": "クリックして音声入力、または長押し",
|
||||
"stop": "録音を停止",
|
||||
"transcribing": "文字起こし中...",
|
||||
"recordingStatus": "録音中 {{time}}"
|
||||
},
|
||||
"voiceErrors": {
|
||||
"unsupported": "このブラウザーは音声入力に対応していません。",
|
||||
"permission": "マイクの許可が必要です。",
|
||||
"notConfigured": "先に文字起こしプロバイダーを設定してください。",
|
||||
"tooLong": "録音が長すぎます。",
|
||||
"tooShort": "もう少し長く録音してください。",
|
||||
"noInput": "マイク入力が検出されませんでした。",
|
||||
"failed": "音声を文字起こしできませんでした。"
|
||||
},
|
||||
"slash": {
|
||||
"ariaLabel": "スラッシュコマンド",
|
||||
"label": "コマンド",
|
||||
|
||||
@ -73,6 +73,7 @@
|
||||
"models": "모델",
|
||||
"providers": "제공자",
|
||||
"image": "이미지",
|
||||
"voice": "음성",
|
||||
"browser": "웹",
|
||||
"runtime": "시스템",
|
||||
"advanced": "보안",
|
||||
@ -99,7 +100,8 @@
|
||||
"mcp": "MCP 서비스",
|
||||
"apps": "앱",
|
||||
"nativeHost": "네이티브 호스트",
|
||||
"hostSafety": "앱 보안"
|
||||
"hostSafety": "앱 보안",
|
||||
"voiceInput": "음성 입력"
|
||||
},
|
||||
"rows": {
|
||||
"theme": "테마",
|
||||
@ -142,7 +144,13 @@
|
||||
"engine": "엔진",
|
||||
"logs": "로그",
|
||||
"diagnostics": "진단",
|
||||
"contextWindow": "컨텍스트 창"
|
||||
"contextWindow": "컨텍스트 창",
|
||||
"transcription": "전사",
|
||||
"transcriptionProvider": "제공자",
|
||||
"transcriptionProviderStatus": "제공자 상태",
|
||||
"transcriptionModel": "모델",
|
||||
"transcriptionLanguage": "언어",
|
||||
"voiceLimits": "제한"
|
||||
},
|
||||
"help": {
|
||||
"theme": "밝은 모드와 어두운 모드를 전환합니다.",
|
||||
@ -181,7 +189,12 @@
|
||||
"diagnostics": "지원용 작은 런타임 보고서를 내보냅니다.",
|
||||
"localServiceAccessNative": "Full Access shell 명령이 이 Mac의 서비스에 접근할 수 있게 합니다.",
|
||||
"webuiDefaultAccessNative": "프로젝트별 권한이 없는 네이티브 채팅에 사용됩니다.",
|
||||
"contextWindow": "이 모델 구성의 기본 컨텍스트 예산을 선택합니다."
|
||||
"contextWindow": "이 모델 구성의 기본 컨텍스트 예산을 선택합니다.",
|
||||
"transcription": "마이크 입력을 보내기 전에 텍스트로 변환합니다. 채널 음성 메시지도 같은 설정을 사용합니다.",
|
||||
"transcriptionProvider": "Providers에 저장된 해당 제공자의 인증 정보를 사용합니다.",
|
||||
"transcriptionProviderStatus": "API 키는 transcription 설정이 아니라 providers 아래에 유지됩니다.",
|
||||
"transcriptionModel": "제공자가 사용자 지정 모델 ID를 요구하지 않으면 해석된 기본값을 사용하세요.",
|
||||
"transcriptionLanguage": "en, zh, ja, ko 같은 선택적 ISO-639 힌트입니다."
|
||||
},
|
||||
"values": {
|
||||
"light": "라이트",
|
||||
@ -283,6 +296,7 @@
|
||||
"totalProviders": "{{count}}개 사용 가능",
|
||||
"webSearch": "웹 검색",
|
||||
"imageGeneration": "이미지 생성",
|
||||
"voiceInput": "음성 입력",
|
||||
"workspace": "작업공간"
|
||||
},
|
||||
"usage": {
|
||||
@ -486,6 +500,11 @@
|
||||
"rawInstructions": "원본 SKILL.md",
|
||||
"rawInstructionsEmpty": "원본 지침이 없습니다.",
|
||||
"detailDescription": "{{name}} 세부 정보."
|
||||
},
|
||||
"voice": {
|
||||
"selectProvider": "제공자 선택",
|
||||
"configureProvider": "제공자 설정",
|
||||
"languageAuto": "자동"
|
||||
}
|
||||
},
|
||||
"chat": {
|
||||
@ -678,6 +697,21 @@
|
||||
"deepResearch": "심층 조사",
|
||||
"voice": "음성 입력"
|
||||
},
|
||||
"voice": {
|
||||
"hint": "클릭해 받아쓰거나 길게 누르기",
|
||||
"stop": "녹음 중지",
|
||||
"transcribing": "변환 중...",
|
||||
"recordingStatus": "녹음 중 {{time}}"
|
||||
},
|
||||
"voiceErrors": {
|
||||
"unsupported": "이 브라우저는 음성 입력을 지원하지 않습니다.",
|
||||
"permission": "마이크 권한이 필요합니다.",
|
||||
"notConfigured": "먼저 음성 변환 제공업체를 설정하세요.",
|
||||
"tooLong": "녹음 시간이 너무 깁니다.",
|
||||
"tooShort": "음성을 녹음하려면 조금 더 길게 눌러 주세요.",
|
||||
"noInput": "마이크 입력이 감지되지 않았습니다.",
|
||||
"failed": "오디오를 변환하지 못했습니다."
|
||||
},
|
||||
"slash": {
|
||||
"ariaLabel": "슬래시 명령",
|
||||
"label": "명령",
|
||||
|
||||
@ -73,6 +73,7 @@
|
||||
"models": "Mô hình",
|
||||
"providers": "Nhà cung cấp",
|
||||
"image": "Hình ảnh",
|
||||
"voice": "Giọng nói",
|
||||
"browser": "Trang web",
|
||||
"runtime": "Hệ thống",
|
||||
"advanced": "Bảo mật",
|
||||
@ -99,7 +100,8 @@
|
||||
"mcp": "Dịch vụ MCP",
|
||||
"apps": "Ứng dụng",
|
||||
"nativeHost": "Host gốc",
|
||||
"hostSafety": "An toàn ứng dụng"
|
||||
"hostSafety": "An toàn ứng dụng",
|
||||
"voiceInput": "Nhap giong noi"
|
||||
},
|
||||
"rows": {
|
||||
"theme": "Chủ đề",
|
||||
@ -142,7 +144,13 @@
|
||||
"engine": "Bộ máy",
|
||||
"logs": "Nhật ký",
|
||||
"diagnostics": "Chẩn đoán",
|
||||
"contextWindow": "Cửa sổ ngữ cảnh"
|
||||
"contextWindow": "Cửa sổ ngữ cảnh",
|
||||
"transcription": "Phien am",
|
||||
"transcriptionProvider": "Nha cung cap",
|
||||
"transcriptionProviderStatus": "Trang thai nha cung cap",
|
||||
"transcriptionModel": "Mo hinh",
|
||||
"transcriptionLanguage": "Ngon ngu",
|
||||
"voiceLimits": "Gioi han"
|
||||
},
|
||||
"help": {
|
||||
"theme": "Chuyển giữa giao diện sáng và tối.",
|
||||
@ -181,7 +189,12 @@
|
||||
"diagnostics": "Exporta un pequeño informe de runtime para soporte.",
|
||||
"localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.",
|
||||
"webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.",
|
||||
"contextWindow": "Chọn ngân sách ngữ cảnh mặc định cho cấu hình mô hình này."
|
||||
"contextWindow": "Chọn ngân sách ngữ cảnh mặc định cho cấu hình mô hình này.",
|
||||
"transcription": "Phien am dau vao micro truoc khi gui. Tin nhan giong noi tu kenh chat dung cung cai dat.",
|
||||
"transcriptionProvider": "Dung thong tin xac thuc cua nha cung cap tu Providers.",
|
||||
"transcriptionProviderStatus": "API key nam trong providers, khong nam trong cai dat transcription.",
|
||||
"transcriptionModel": "Giu mac dinh da resolve tru khi nha cung cap can id model tuy chinh.",
|
||||
"transcriptionLanguage": "Goi y ISO-639 tuy chon, nhu en, zh, ja hoac ko."
|
||||
},
|
||||
"values": {
|
||||
"light": "Sáng",
|
||||
@ -283,6 +296,7 @@
|
||||
"totalProviders": "{{count}} khả dụng",
|
||||
"webSearch": "Tìm kiếm web",
|
||||
"imageGeneration": "Tạo hình ảnh",
|
||||
"voiceInput": "Nhập bằng giọng nói",
|
||||
"workspace": "Không gian làm việc"
|
||||
},
|
||||
"usage": {
|
||||
@ -486,6 +500,11 @@
|
||||
"rawInstructions": "SKILL.md gốc",
|
||||
"rawInstructionsEmpty": "Không có hướng dẫn gốc.",
|
||||
"detailDescription": "Chi tiết cho {{name}}."
|
||||
},
|
||||
"voice": {
|
||||
"selectProvider": "Chon nha cung cap",
|
||||
"configureProvider": "Cau hinh nha cung cap",
|
||||
"languageAuto": "Tu dong"
|
||||
}
|
||||
},
|
||||
"chat": {
|
||||
@ -678,6 +697,21 @@
|
||||
"deepResearch": "Nghiên cứu sâu",
|
||||
"voice": "Nhập bằng giọng nói"
|
||||
},
|
||||
"voice": {
|
||||
"hint": "Bấm để đọc chính tả hoặc nhấn giữ",
|
||||
"stop": "Dừng ghi âm",
|
||||
"transcribing": "Đang chép lời...",
|
||||
"recordingStatus": "Đang ghi {{time}}"
|
||||
},
|
||||
"voiceErrors": {
|
||||
"unsupported": "Trình duyệt này không hỗ trợ nhập bằng giọng nói.",
|
||||
"permission": "Cần quyền truy cập micrô.",
|
||||
"notConfigured": "Hãy cấu hình nhà cung cấp chép lời trước.",
|
||||
"tooLong": "Bản ghi âm quá dài.",
|
||||
"tooShort": "Giữ lâu hơn một chút để ghi âm giọng nói.",
|
||||
"noInput": "Không phát hiện đầu vào micrô.",
|
||||
"failed": "Không thể chép lời âm thanh."
|
||||
},
|
||||
"slash": {
|
||||
"ariaLabel": "Lệnh slash",
|
||||
"label": "lệnh",
|
||||
|
||||
@ -73,6 +73,7 @@
|
||||
"models": "模型",
|
||||
"providers": "提供商",
|
||||
"image": "图片",
|
||||
"voice": "语音",
|
||||
"browser": "网页",
|
||||
"cliApps": "CLI 应用",
|
||||
"mcp": "MCP",
|
||||
@ -99,7 +100,8 @@
|
||||
"capabilities": "能力",
|
||||
"apps": "应用",
|
||||
"nativeHost": "原生宿主",
|
||||
"hostSafety": "应用安全"
|
||||
"hostSafety": "应用安全",
|
||||
"voiceInput": "语音识别"
|
||||
},
|
||||
"models": {
|
||||
"selectModel": "选择模型",
|
||||
@ -161,7 +163,13 @@
|
||||
"engine": "引擎",
|
||||
"logs": "日志",
|
||||
"diagnostics": "诊断",
|
||||
"contextWindow": "上下文窗口"
|
||||
"contextWindow": "上下文窗口",
|
||||
"transcription": "语音转写",
|
||||
"transcriptionProvider": "提供商",
|
||||
"transcriptionProviderStatus": "提供商状态",
|
||||
"transcriptionModel": "模型",
|
||||
"transcriptionLanguage": "语言",
|
||||
"voiceLimits": "限制"
|
||||
},
|
||||
"help": {
|
||||
"theme": "在浅色和深色外观之间切换。",
|
||||
@ -200,7 +208,12 @@
|
||||
"diagnostics": "导出一份用于支持排查的小型运行报告。",
|
||||
"localServiceAccessNative": "允许完全访问权限下的 shell 命令访问这台 Mac 上的服务。",
|
||||
"webuiDefaultAccessNative": "用于没有单独项目权限的原生聊天。",
|
||||
"contextWindow": "选择此模型配置的默认上下文预算。"
|
||||
"contextWindow": "选择此模型配置的默认上下文预算。",
|
||||
"transcription": "发送前先把麦克风输入转写到输入框。聊天渠道里的语音消息也使用同一套设置。",
|
||||
"transcriptionProvider": "使用「提供商」中对应提供商的凭据。",
|
||||
"transcriptionProviderStatus": "API Key 仍保存在 providers 里,不写进 transcription 设置。",
|
||||
"transcriptionModel": "除非提供商需要自定义模型 ID,否则保持解析后的默认值即可。",
|
||||
"transcriptionLanguage": "可选 ISO-639 语言提示,例如 en、zh、ja 或 ko。"
|
||||
},
|
||||
"timezone": {
|
||||
"select": "选择时区",
|
||||
@ -391,6 +404,7 @@
|
||||
"totalProviders": "共 {{count}} 个可用",
|
||||
"webSearch": "网页搜索",
|
||||
"imageGeneration": "图片生成",
|
||||
"voiceInput": "语音识别",
|
||||
"workspace": "工作区"
|
||||
},
|
||||
"usage": {
|
||||
@ -486,6 +500,11 @@
|
||||
"rawInstructions": "原始 SKILL.md",
|
||||
"rawInstructionsEmpty": "没有原始说明。",
|
||||
"detailDescription": "{{name}} 的详情。"
|
||||
},
|
||||
"voice": {
|
||||
"selectProvider": "选择提供商",
|
||||
"configureProvider": "配置提供商",
|
||||
"languageAuto": "自动"
|
||||
}
|
||||
},
|
||||
"chat": {
|
||||
@ -677,6 +696,21 @@
|
||||
"deepResearch": "深度研究",
|
||||
"voice": "语音输入"
|
||||
},
|
||||
"voice": {
|
||||
"hint": "点击进行听写或长按",
|
||||
"stop": "停止录音",
|
||||
"transcribing": "正在转写...",
|
||||
"recordingStatus": "正在录音 {{time}}"
|
||||
},
|
||||
"voiceErrors": {
|
||||
"unsupported": "当前浏览器不支持语音输入。",
|
||||
"permission": "需要麦克风权限。",
|
||||
"notConfigured": "请先配置转写提供商。",
|
||||
"tooLong": "录音时间太长。",
|
||||
"tooShort": "请稍微多录一会儿。",
|
||||
"noInput": "没有检测到麦克风输入。",
|
||||
"failed": "语音转写失败。"
|
||||
},
|
||||
"slash": {
|
||||
"ariaLabel": "斜杠命令",
|
||||
"label": "命令",
|
||||
|
||||
@ -73,6 +73,7 @@
|
||||
"models": "模型",
|
||||
"providers": "提供商",
|
||||
"image": "圖片",
|
||||
"voice": "語音",
|
||||
"browser": "網頁",
|
||||
"runtime": "系統",
|
||||
"advanced": "安全",
|
||||
@ -99,7 +100,8 @@
|
||||
"mcp": "MCP 服務",
|
||||
"apps": "應用",
|
||||
"nativeHost": "原生宿主",
|
||||
"hostSafety": "App 安全"
|
||||
"hostSafety": "App 安全",
|
||||
"voiceInput": "語音辨識"
|
||||
},
|
||||
"rows": {
|
||||
"theme": "主題",
|
||||
@ -142,7 +144,13 @@
|
||||
"engine": "引擎",
|
||||
"logs": "日誌",
|
||||
"diagnostics": "診斷",
|
||||
"contextWindow": "上下文視窗"
|
||||
"contextWindow": "上下文視窗",
|
||||
"transcription": "語音轉寫",
|
||||
"transcriptionProvider": "提供商",
|
||||
"transcriptionProviderStatus": "提供商狀態",
|
||||
"transcriptionModel": "模型",
|
||||
"transcriptionLanguage": "語言",
|
||||
"voiceLimits": "限制"
|
||||
},
|
||||
"help": {
|
||||
"theme": "在淺色與深色外觀之間切換。",
|
||||
@ -181,7 +189,12 @@
|
||||
"diagnostics": "匯出一份用於支援排查的小型執行報告。",
|
||||
"localServiceAccessNative": "允許完全訪問權限下的 shell 命令訪問這台 Mac 上的服務。",
|
||||
"webuiDefaultAccessNative": "用於沒有單獨專案權限的原生聊天。",
|
||||
"contextWindow": "選擇此模型配置的預設上下文預算。"
|
||||
"contextWindow": "選擇此模型配置的預設上下文預算。",
|
||||
"transcription": "送出前先把麥克風輸入轉寫到輸入框。聊天渠道的語音訊息也使用同一組設定。",
|
||||
"transcriptionProvider": "使用「提供商」中對應提供商的憑證。",
|
||||
"transcriptionProviderStatus": "API Key 仍保存在 providers 裡,不寫進 transcription 設定。",
|
||||
"transcriptionModel": "除非提供商需要自訂模型 ID,否則保持解析後的預設值即可。",
|
||||
"transcriptionLanguage": "可選 ISO-639 語言提示,例如 en、zh、ja 或 ko。"
|
||||
},
|
||||
"values": {
|
||||
"light": "淺色",
|
||||
@ -283,6 +296,7 @@
|
||||
"totalProviders": "共 {{count}} 個可用",
|
||||
"webSearch": "網頁搜尋",
|
||||
"imageGeneration": "圖片生成",
|
||||
"voiceInput": "語音辨識",
|
||||
"workspace": "工作區"
|
||||
},
|
||||
"usage": {
|
||||
@ -486,6 +500,11 @@
|
||||
"rawInstructions": "原始 SKILL.md",
|
||||
"rawInstructionsEmpty": "沒有原始說明。",
|
||||
"detailDescription": "{{name}} 的詳細資訊。"
|
||||
},
|
||||
"voice": {
|
||||
"selectProvider": "選擇提供商",
|
||||
"configureProvider": "設定提供商",
|
||||
"languageAuto": "自動"
|
||||
}
|
||||
},
|
||||
"chat": {
|
||||
@ -678,6 +697,21 @@
|
||||
"deepResearch": "深度研究",
|
||||
"voice": "語音輸入"
|
||||
},
|
||||
"voice": {
|
||||
"hint": "點擊進行聽寫或長按",
|
||||
"stop": "停止錄音",
|
||||
"transcribing": "正在轉寫...",
|
||||
"recordingStatus": "正在錄音 {{time}}"
|
||||
},
|
||||
"voiceErrors": {
|
||||
"unsupported": "目前瀏覽器不支援語音輸入。",
|
||||
"permission": "需要麥克風權限。",
|
||||
"notConfigured": "請先設定轉寫提供商。",
|
||||
"tooLong": "錄音時間太長。",
|
||||
"tooShort": "請稍微多錄一會兒。",
|
||||
"noInput": "沒有偵測到麥克風輸入。",
|
||||
"failed": "語音轉寫失敗。"
|
||||
},
|
||||
"slash": {
|
||||
"ariaLabel": "斜線命令",
|
||||
"label": "命令",
|
||||
|
||||
210
webui/src/lib/ansi.ts
Normal file
210
webui/src/lib/ansi.ts
Normal file
@ -0,0 +1,210 @@
|
||||
/**
 * One run of plain text produced by `parseAnsiSegments`, together with the
 * styling that was active when the run was emitted.
 */
export type AnsiSegment = {
  /** Visible characters of the run, with escape sequences already removed. */
  text: string;
  /** Styling for the run; absent/undefined when no SGR attribute is active. */
  style?: AnsiStyle;
};
|
||||
|
||||
/**
 * Presentation attributes derived from the active SGR state.
 *
 * Field names follow CSS property names in camelCase.
 * NOTE(review): presumably consumed as an inline style object by the renderer — confirm with the caller.
 */
export type AnsiStyle = {
  /** Background colour as a CSS colour string (hex or `rgb(...)`). */
  backgroundColor?: string;
  /** Foreground colour as a CSS colour string (hex or `rgb(...)`). */
  color?: string;
  /** Set for SGR 3 (italic). */
  fontStyle?: "italic";
  /** Set to 700 for SGR 1 (bold). */
  fontWeight?: number;
  /** Set to a value below 1 for SGR 2 (dim/faint). */
  opacity?: number;
  /** Set for SGR 4 (underline). */
  textDecorationLine?: "underline";
};
|
||||
|
||||
/**
 * Mutable SGR state tracked while walking a string.
 *
 * The colour fields are optional: an absent key means "no explicit colour".
 * The boolean flags are always present.
 */
type AnsiState = {
  /** Explicit background colour, if one has been selected. */
  backgroundColor?: string;
  /** SGR 1 active. */
  bold: boolean;
  /** Explicit foreground colour, if one has been selected. */
  color?: string;
  /** SGR 2 active. */
  dim: boolean;
  /** SGR 7 active: foreground and background are swapped when rendering. */
  inverse: boolean;
  /** SGR 3 active. */
  italic: boolean;
  /** SGR 4 active. */
  underline: boolean;
};
|
||||
|
||||
/** The escape control character (U+001B) that introduces every sequence. */
const ESC = "\u001b";

/**
 * Matches one CSI sequence: ESC, "[", parameter bytes (0x30-0x3F),
 * intermediate bytes (0x20-0x2F), and a final byte (0x40-0x7E).
 *
 * The regex is global and shared, so callers rewind `lastIndex` before use.
 */
const ANSI_PATTERN = new RegExp(ESC + "\\[[0-?]*[ -/]*[@-~]", "g");

/** Standard palette, indexed by SGR 30-37 / 40-47 and 256-colour indices 0-7. */
const ANSI_COLORS = [
  "#000000", // black
  "#cd3131", // red
  "#0dbc79", // green
  "#e5e510", // yellow
  "#2472c8", // blue
  "#bc3fbc", // magenta
  "#11a8cd", // cyan
  "#e5e5e5", // white
];

/** Bright palette, indexed by SGR 90-97 / 100-107 and 256-colour indices 8-15. */
const ANSI_BRIGHT_COLORS = [
  "#666666", // bright black
  "#f14c4c", // bright red
  "#23d18b", // bright green
  "#f5f543", // bright yellow
  "#3b8eea", // bright blue
  "#d670d6", // bright magenta
  "#29b8db", // bright cyan
  "#ffffff", // bright white
];

/** Channel intensities of the 6x6x6 colour cube (256-colour indices 16-231). */
const RGB_STEPS = [0, 95, 135, 175, 215, 255];
|
||||
|
||||
/**
 * Reports whether `value` contains at least one escape sequence matched by
 * `ANSI_PATTERN`.
 *
 * @param value Text to inspect.
 * @returns `true` when a sequence is found, otherwise `false`.
 */
export function hasAnsi(value: string): boolean {
  // The pattern is a shared global regex; rewind it so a previous search
  // cannot make this one start mid-string.
  ANSI_PATTERN.lastIndex = 0;
  const firstMatch = ANSI_PATTERN.exec(value);
  return firstMatch !== null;
}
|
||||
|
||||
/**
 * Removes every escape sequence matched by `ANSI_PATTERN` from `value`.
 *
 * @param value Text that may contain escape sequences.
 * @returns The text with all matched sequences deleted.
 */
export function stripAnsi(value: string): string {
  // Rewind the shared global regex before reuse.
  ANSI_PATTERN.lastIndex = 0;
  const withoutSequences = value.replace(ANSI_PATTERN, () => "");
  return withoutSequences;
}
|
||||
|
||||
/**
 * Builds a fresh SGR state with every flag off.
 *
 * The optional colour keys are deliberately left absent, which represents
 * "no explicit colour". A new object is returned on every call.
 */
function initialState(): AnsiState {
  const blank: AnsiState = {
    bold: false,
    dim: false,
    inverse: false,
    italic: false,
    underline: false,
  };
  return blank;
}
|
||||
|
||||
/**
 * Converts a 256-colour palette index to a CSS colour string.
 *
 * Layout of the palette as implemented here:
 * - 0-7: standard colours (`ANSI_COLORS`)
 * - 8-15: bright colours (`ANSI_BRIGHT_COLORS`)
 * - 16-231: 6x6x6 colour cube built from `RGB_STEPS`
 * - 232-255: grayscale ramp, `8 + 10 * (value - 232)` per channel
 *
 * @param value Palette index taken from an SGR parameter list.
 * @returns A hex or `rgb(...)` string, or `undefined` when `value` is not an
 *   integer in 0-255.
 */
function colorFrom256(value: number): string | undefined {
  // A truncated sequence such as ESC[38;5m hands us `undefined` at runtime.
  // Every ordered comparison with undefined/NaN is false, so without this
  // guard the call fell through to the grayscale branch and produced
  // "rgb(NaN, NaN, NaN)".
  if (!Number.isInteger(value) || value < 0 || value > 255) return undefined;
  if (value < 8) return ANSI_COLORS[value];
  if (value < 16) return ANSI_BRIGHT_COLORS[value - 8];
  if (value < 232) {
    const offset = value - 16;
    const red = RGB_STEPS[Math.floor(offset / 36)];
    const green = RGB_STEPS[Math.floor((offset % 36) / 6)];
    const blue = RGB_STEPS[offset % 6];
    return `rgb(${red}, ${green}, ${blue})`;
  }
  const gray = 8 + (value - 232) * 10;
  return `rgb(${gray}, ${gray}, ${gray})`;
}
|
||||
|
||||
/**
 * Formats three colour channels as a CSS `rgb(...)` string.
 *
 * @param red Red channel.
 * @param green Green channel.
 * @param blue Blue channel.
 * @returns `rgb(r, g, b)`, or `undefined` when any channel is not a finite
 *   number within 0-255 (this also covers missing parameters).
 */
function colorFromRgb(red: number, green: number, blue: number): string | undefined {
  const isValidChannel = (channel: number): boolean =>
    Number.isFinite(channel) && channel >= 0 && channel <= 255;

  if (!(isValidChannel(red) && isValidChannel(green) && isValidChannel(blue))) {
    return undefined;
  }
  return `rgb(${red}, ${green}, ${blue})`;
}
|
||||
|
||||
/**
 * Extracts the numeric parameters of an SGR ("m") sequence.
 *
 * @param sequence A complete CSI sequence, starting with ESC and "[".
 * @returns The parameter list, with empty parameters read as 0 and an empty
 *   body read as `[0]`. Returns `null` when the sequence is not a plain SGR
 *   sequence.
 */
function normalizedSgrParams(sequence: string): number[] | null {
  if (!sequence.endsWith("m")) return null;
  // Drop the two-character introducer (ESC + "[") and the final "m".
  const body = sequence.slice(2, -1).trim();
  if (!body) return [0];
  // Sequences with a private marker or other non-numeric bytes (for example
  // xterm's "CSI > 4 ; 2 m") end in "m" but are not SGR. Previously their
  // unparsable parts were mapped to 0, which reset all styling.
  if (!/^[0-9;:]*$/.test(body)) return null;
  return body.split(/[;:]/).map((part) => {
    const value = Number.parseInt(part || "0", 10);
    // Absurdly long digit runs parse to Infinity; treat them as 0.
    return Number.isFinite(value) ? value : 0;
  });
}
|
||||
|
||||
/**
 * Handles an extended colour selector (SGR 38 or 48) found at `params[index]`.
 *
 * The parameter after the selector chooses the form:
 * - `5`: one palette index follows (resolved with `colorFrom256`)
 * - `2`: three channel values follow (resolved with `colorFromRgb`)
 *
 * @param state SGR state to update in place.
 * @param params Full parameter list of the current sequence.
 * @param index Position of the 38/48 selector within `params`.
 * @param key Which colour slot of `state` to write.
 * @returns The index of the last parameter consumed, so the caller's loop
 *   resumes after it. Returns `index` unchanged for an unrecognised form.
 */
function applyExtendedColor(
  state: AnsiState,
  params: number[],
  index: number,
  key: "color" | "backgroundColor",
): number {
  switch (params[index + 1]) {
    case 5: {
      const indexed = colorFrom256(params[index + 2]);
      // An invalid index leaves the previous colour in place.
      if (indexed) state[key] = indexed;
      return index + 2;
    }
    case 2: {
      const red = params[index + 2];
      const green = params[index + 3];
      const blue = params[index + 4];
      const direct = colorFromRgb(red, green, blue);
      if (direct) state[key] = direct;
      return index + 4;
    }
    default:
      return index;
  }
}
|
||||
|
||||
/**
 * Applies a list of SGR parameters to `state`, in order.
 *
 * Supported codes: 0 (reset), 1/2 (bold/dim, mutually exclusive here),
 * 3 (italic), 4 (underline), 7 (inverse), 22/23/24/27 (turn those off),
 * 30-37 / 90-97 (foreground), 40-47 / 100-107 (background),
 * 38 / 48 (extended colour), 39 / 49 (default colour). Other codes are ignored.
 *
 * @param state SGR state, mutated in place.
 * @param params Parameters of a single SGR sequence.
 */
function applySgrParams(state: AnsiState, params: number[]): void {
  for (let index = 0; index < params.length; index += 1) {
    const code = params[index];
    switch (code) {
      case 0:
        // initialState() has no colour keys, so Object.assign alone leaves any
        // previously selected colours in place. Remove them explicitly so a
        // reset really returns to the default colours.
        Object.assign(state, initialState());
        delete state.color;
        delete state.backgroundColor;
        break;
      case 1:
        state.bold = true;
        state.dim = false;
        break;
      case 2:
        state.dim = true;
        state.bold = false;
        break;
      case 3:
        state.italic = true;
        break;
      case 4:
        state.underline = true;
        break;
      case 7:
        state.inverse = true;
        break;
      case 22:
        // "Normal intensity" cancels both bold and dim.
        state.bold = false;
        state.dim = false;
        break;
      case 23:
        state.italic = false;
        break;
      case 24:
        state.underline = false;
        break;
      case 27:
        state.inverse = false;
        break;
      case 38:
        // The helper returns the last parameter it consumed.
        index = applyExtendedColor(state, params, index, "color");
        break;
      case 39:
        delete state.color;
        break;
      case 48:
        index = applyExtendedColor(state, params, index, "backgroundColor");
        break;
      case 49:
        delete state.backgroundColor;
        break;
      default:
        if (code >= 30 && code <= 37) {
          state.color = ANSI_COLORS[code - 30];
        } else if (code >= 40 && code <= 47) {
          state.backgroundColor = ANSI_COLORS[code - 40];
        } else if (code >= 90 && code <= 97) {
          state.color = ANSI_BRIGHT_COLORS[code - 90];
        } else if (code >= 100 && code <= 107) {
          state.backgroundColor = ANSI_BRIGHT_COLORS[code - 100];
        }
        break;
    }
  }
}
|
||||
|
||||
/**
 * Convert the running ANSI state into inline style properties.
 *
 * While `inverse` is set the foreground and background colors trade places.
 * Only attributes that are active produce a property.
 *
 * @param state Current ANSI attribute state.
 * @returns The style object, or `undefined` when no property would be set.
 */
function styleFromState(state: AnsiState): AnsiStyle | undefined {
  const { inverse, color, backgroundColor } = state;
  const [textColor, fillColor] = inverse
    ? [backgroundColor, color]
    : [color, backgroundColor];

  const result: AnsiStyle = {};
  if (textColor) {
    result.color = textColor;
  }
  if (fillColor) {
    result.backgroundColor = fillColor;
  }
  if (state.bold) {
    result.fontWeight = 700;
  }
  if (state.dim) {
    result.opacity = 0.72;
  }
  if (state.italic) {
    result.fontStyle = "italic";
  }
  if (state.underline) {
    result.textDecorationLine = "underline";
  }

  const isEmpty = Object.keys(result).length === 0;
  return isEmpty ? undefined : result;
}
|
||||
|
||||
/**
 * Split `value` into text runs, each paired with the style in effect for it.
 *
 * Every match of `ANSI_PATTERN` is removed from the output. When
 * `normalizedSgrParams` yields parameters for a match they update the running
 * state; otherwise the match is dropped without changing the style. Text
 * between matches is emitted with the style active at that point.
 *
 * @param value Raw text that may contain ANSI escape sequences.
 * @returns Non-empty segments in source order.
 */
export function parseAnsiSegments(value: string): AnsiSegment[] {
  const state = initialState();
  const output: AnsiSegment[] = [];

  // Emits value[start, end) with the current style; empty ranges are skipped.
  const emit = (start: number, end: number): void => {
    const text = value.slice(start, end);
    if (text.length > 0) {
      output.push({ text, style: styleFromState(state) });
    }
  };

  let consumed = 0;
  // matchAll starts from the pattern's current lastIndex, so rewind it first.
  ANSI_PATTERN.lastIndex = 0;
  for (const hit of value.matchAll(ANSI_PATTERN)) {
    const start = hit.index ?? 0;
    const sequence = hit[0];
    emit(consumed, start);

    const sgr = normalizedSgrParams(sequence);
    if (sgr) {
      applySgrParams(state, sgr);
    }
    consumed = start + sequence.length;
  }

  emit(consumed, value.length);
  return output;
}
|
||||
@ -16,6 +16,7 @@ import type {
|
||||
SkillDetail,
|
||||
SkillsPayload,
|
||||
SlashCommand,
|
||||
TranscriptionSettingsUpdate,
|
||||
WebSearchSettingsUpdate,
|
||||
WorkspacesPayload,
|
||||
WebuiThreadPersistedPayload,
|
||||
@ -547,3 +548,21 @@ export async function updateImageGenerationSettings(
|
||||
token,
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Send updated transcription settings to the settings API.
 *
 * The update is encoded as query parameters (`enabled`, `provider`, `model`,
 * `language`, `max_duration_sec`, `max_upload_mb`) on
 * `/api/settings/transcription/update`.
 *
 * @param token  Auth token forwarded to `request`.
 * @param update New transcription settings.
 * @param base   Optional URL prefix; defaults to the current origin (`""`).
 * @returns The settings payload returned by the server.
 */
export async function updateTranscriptionSettings(
  token: string,
  update: TranscriptionSettingsUpdate,
  base: string = "",
): Promise<SettingsPayload> {
  const { enabled, provider, model, language, maxDurationSec, maxUploadMb } = update;
  const params = new URLSearchParams({
    enabled: String(enabled),
    provider,
    model,
    language,
    max_duration_sec: String(maxDurationSec),
    max_upload_mb: String(maxUploadMb),
  });
  const endpoint = `${base}/api/settings/transcription/update?${params.toString()}`;
  return request<SettingsPayload>(endpoint, token);
}
|
||||
|
||||
@ -95,6 +95,12 @@ interface PendingNewChat {
|
||||
timer: ReturnType<typeof setTimeout>;
|
||||
}
|
||||
|
||||
/** Bookkeeping for one transcription request that has not settled yet. */
interface PendingTranscription {
  /** Timeout handle; cleared as soon as the request settles. */
  timer: ReturnType<typeof setTimeout>;
  /** Settles the caller's promise with the transcribed text. */
  resolve: (text: string) => void;
  /** Fails the caller's promise. */
  reject: (err: Error) => void;
}
|
||||
|
||||
export interface NanobotClientOptions {
|
||||
url: string;
|
||||
reconnect?: boolean;
|
||||
@ -132,6 +138,7 @@ export class NanobotClient {
|
||||
/** Latest ``goal_state`` snapshot per ``chat_id`` (multi-session isolation). */
|
||||
private goalStateByChatId = new Map<string, GoalStateWsPayload>();
|
||||
private pendingNewChat: PendingNewChat | null = null;
|
||||
private pendingTranscriptions = new Map<string, PendingTranscription>();
|
||||
// Frames queued while the socket is not yet OPEN
|
||||
private sendQueue: Outbound[] = [];
|
||||
private reconnectAttempts = 0;
|
||||
@ -320,6 +327,27 @@ export class NanobotClient {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Ask the server to transcribe an audio clip and resolve with the text.
 *
 * A `transcribe_audio` frame carrying a fresh request id is queued for
 * sending, and the returned promise stays registered in
 * `pendingTranscriptions` under that id until it is settled or times out.
 *
 * @param dataUrl Audio clip encoded as a data URL.
 * @param options `durationMs` is forwarded as `duration_ms` when provided;
 *                `timeoutMs` bounds the wait (default 120 000 ms).
 * @returns Promise resolved with the transcribed text, or rejected with an
 *          `Error` (for example `"transcription timed out"`).
 */
transcribeAudio(
  dataUrl: string,
  options?: { durationMs?: number; timeoutMs?: number },
): Promise<string> {
  // `crypto.randomUUID` is only exposed in secure contexts (HTTPS/localhost).
  // The id is just a correlation key, so fall back to a time + random token
  // instead of throwing when the page is served over plain HTTP.
  const requestId =
    typeof crypto !== "undefined" && typeof crypto.randomUUID === "function"
      ? crypto.randomUUID()
      : `transcribe-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 12)}`;
  const timeoutMs = options?.timeoutMs ?? 120_000;

  return new Promise<string>((resolve, reject) => {
    const timer = setTimeout(() => {
      this.pendingTranscriptions.delete(requestId);
      reject(new Error("transcription timed out"));
    }, timeoutMs);
    this.pendingTranscriptions.set(requestId, { resolve, reject, timer });

    try {
      this.queueSend({
        type: "transcribe_audio",
        request_id: requestId,
        data_url: dataUrl,
        ...(options?.durationMs !== undefined ? { duration_ms: options.durationMs } : {}),
      });
    } catch (err) {
      // Do not leave a dangling timer/map entry behind if the frame could not
      // even be queued; surface the failure to the caller right away.
      clearTimeout(timer);
      this.pendingTranscriptions.delete(requestId);
      reject(err);
    }
  });
}
|
||||
|
||||
attach(chatId: string): void {
|
||||
this.knownChats.add(chatId);
|
||||
if (this.socket?.readyState === WS_OPEN) {
|
||||
@ -425,6 +453,16 @@ export class NanobotClient {
|
||||
return;
|
||||
}
|
||||
|
||||
if (parsed.event === "transcription_result") {
|
||||
this.resolveTranscription(parsed.request_id, parsed.text);
|
||||
return;
|
||||
}
|
||||
|
||||
if (parsed.event === "transcription_error") {
|
||||
this.rejectTranscription(parsed.request_id, parsed.detail || "error");
|
||||
return;
|
||||
}
|
||||
|
||||
if (parsed.event === "session_updated") {
|
||||
this.emitSessionUpdate(parsed.chat_id, parsed.scope, parsed.workspace_scope);
|
||||
return;
|
||||
@ -500,6 +538,7 @@ export class NanobotClient {
|
||||
this.pendingNewChat.reject(new Error("socket closed"));
|
||||
this.pendingNewChat = null;
|
||||
}
|
||||
this.rejectAllTranscriptions("socket closed");
|
||||
// Surface structured reasons *before* reconnect logic so the UI can
|
||||
// display the error even while the client transparently reconnects.
|
||||
// Browsers populate ``CloseEvent.code`` with the wire-level close code;
|
||||
@ -528,6 +567,34 @@ export class NanobotClient {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Settle the pending transcription `requestId` with `text`.
 * Ids that are not (or no longer) pending are ignored.
 */
private resolveTranscription(requestId: string, text: string): void {
  const entry = this.pendingTranscriptions.get(requestId);
  if (entry) {
    // Stop the timeout and forget the request before handing back the text.
    clearTimeout(entry.timer);
    this.pendingTranscriptions.delete(requestId);
    entry.resolve(text);
  }
}
|
||||
|
||||
/**
 * Fail a pending transcription with `detail` as the error message.
 *
 * Without a request id the error cannot be attributed to a single request,
 * so every pending transcription is rejected. An id that is not pending is
 * ignored.
 */
private rejectTranscription(requestId: string | undefined, detail: string): void {
  if (requestId) {
    const entry = this.pendingTranscriptions.get(requestId);
    if (entry) {
      clearTimeout(entry.timer);
      this.pendingTranscriptions.delete(requestId);
      entry.reject(new Error(detail));
    }
    return;
  }
  this.rejectAllTranscriptions(detail);
}
|
||||
|
||||
/**
 * Reject every pending transcription with `detail` and empty the registry.
 * Each entry's timeout is cancelled before its promise is rejected.
 */
private rejectAllTranscriptions(detail: string): void {
  this.pendingTranscriptions.forEach((entry, id) => {
    clearTimeout(entry.timer);
    entry.reject(new Error(detail));
    this.pendingTranscriptions.delete(id);
  });
}
|
||||
|
||||
private scheduleReconnect(): void {
|
||||
this.setStatus("reconnecting");
|
||||
const attempt = this.reconnectAttempts++;
|
||||
|
||||
@ -391,6 +391,23 @@ export interface SettingsPayload {
|
||||
default_api_base?: string | null;
|
||||
}>;
|
||||
};
|
||||
transcription?: {
|
||||
enabled: boolean;
|
||||
provider: string;
|
||||
provider_configured: boolean;
|
||||
model: string;
|
||||
language: string | null;
|
||||
max_duration_sec: number;
|
||||
max_upload_mb: number;
|
||||
providers: Array<{
|
||||
name: string;
|
||||
label: string;
|
||||
configured: boolean;
|
||||
api_key_hint?: string | null;
|
||||
api_base?: string | null;
|
||||
default_api_base?: string | null;
|
||||
}>;
|
||||
};
|
||||
runtime: {
|
||||
config_path: string;
|
||||
workspace_path: string;
|
||||
@ -680,6 +697,15 @@ export interface ImageGenerationSettingsUpdate {
|
||||
maxImagesPerTurn: number;
|
||||
}
|
||||
|
||||
/** Values submitted when saving the transcription (voice input) settings. */
export interface TranscriptionSettingsUpdate {
  /** Whether transcription is switched on. */
  enabled: boolean;

  /** Name of the transcription provider to use. */
  provider: string;
  /** Model identifier for the selected provider. */
  model: string;
  /** Language setting passed along with the request. */
  language: string;

  /** Maximum recording duration (presumably seconds, per the name). */
  maxDurationSec: number;
  /** Maximum upload size (presumably megabytes, per the name). */
  maxUploadMb: number;
}
|
||||
|
||||
export interface SlashCommand {
|
||||
command: string;
|
||||
title: string;
|
||||
@ -782,6 +808,13 @@ export type InboundEvent =
|
||||
scope?: "metadata" | "thread" | string;
|
||||
workspace_scope?: WorkspaceScopePayload;
|
||||
}
|
||||
| { event: "transcription_result"; request_id: string; text: string }
|
||||
| {
|
||||
event: "transcription_error";
|
||||
request_id?: string;
|
||||
detail?: string;
|
||||
provider?: string;
|
||||
}
|
||||
| { event: "error"; chat_id?: string; detail?: string; reason?: string };
|
||||
|
||||
/** Base64-encoded image attached to an outbound ``message`` envelope.
|
||||
@ -845,6 +878,7 @@ export type Outbound =
|
||||
| { type: "new_chat"; workspace_scope?: WorkspaceScopePayload }
|
||||
| { type: "attach"; chat_id: string }
|
||||
| { type: "set_workspace_scope"; chat_id: string; workspace_scope: WorkspaceScopePayload }
|
||||
| { type: "transcribe_audio"; request_id: string; data_url: string; duration_ms?: number }
|
||||
| {
|
||||
type: "message";
|
||||
chat_id: string;
|
||||
|
||||
@ -1172,13 +1172,13 @@ describe("App layout", () => {
|
||||
|
||||
it("restores the settings section from the URL hash after a page reload", async () => {
|
||||
mockFetchRoutes({ "/api/settings": baseSettingsPayload() });
|
||||
window.history.replaceState(null, "", "/#/settings?section=models");
|
||||
window.history.replaceState(null, "", "/#/settings?section=voice");
|
||||
|
||||
render(<App />);
|
||||
|
||||
await waitFor(() => expect(connectSpy).toHaveBeenCalled());
|
||||
expect(await screen.findByRole("heading", { name: "Models" })).toBeInTheDocument();
|
||||
expect(window.location.hash).toBe("#/settings?section=models");
|
||||
expect(await screen.findByRole("heading", { name: "Voice input" })).toBeInTheDocument();
|
||||
expect(window.location.hash).toBe("#/settings?section=voice");
|
||||
});
|
||||
|
||||
it("updates the URL hash when switching settings sections", async () => {
|
||||
@ -1197,6 +1197,11 @@ describe("App layout", () => {
|
||||
|
||||
expect(await screen.findByRole("heading", { name: "Models" })).toBeInTheDocument();
|
||||
expect(window.location.hash).toBe("#/settings?section=models");
|
||||
|
||||
fireEvent.click(within(settingsNav).getByRole("button", { name: "Voice" }));
|
||||
|
||||
expect(await screen.findByRole("heading", { name: "Voice input" })).toBeInTheDocument();
|
||||
expect(window.location.hash).toBe("#/settings?section=voice");
|
||||
});
|
||||
|
||||
it("opens Apps from the main sidebar without replacing the sidebar", async () => {
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import { act, render, screen } from "@testing-library/react";
|
||||
import userEvent from "@testing-library/user-event";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
|
||||
import { CodeBlock } from "@/components/CodeBlock";
|
||||
@ -87,6 +88,64 @@ describe("CodeBlock", () => {
|
||||
expect(screen.getByText("const value = 1;")).toBeInTheDocument();
|
||||
});
|
||||
|
||||
it("renders ANSI output without mounting the syntax highlighter", () => {
|
||||
render(
|
||||
<ThemeProvider theme="dark">
|
||||
<CodeBlock
|
||||
language="ansi"
|
||||
code={"\x1b[32mPASS\x1b[0m <script>alert(1)</script>"}
|
||||
/>
|
||||
</ThemeProvider>,
|
||||
);
|
||||
|
||||
expect(screen.queryByTestId("highlighted-code")).not.toBeInTheDocument();
|
||||
expect(screen.getByTestId("ansi-code")).toBeInTheDocument();
|
||||
expect(screen.getByTestId("ansi-code").closest(".not-prose")).toBeTruthy();
|
||||
expect(screen.getByText("ansi")).toBeInTheDocument();
|
||||
expect(screen.getByText("PASS")).toHaveStyle({ color: "#0dbc79" });
|
||||
expect(screen.getByText("<script>alert(1)</script>")).toBeInTheDocument();
|
||||
expect(document.querySelector("script")).toBeNull();
|
||||
});
|
||||
|
||||
it("detects ANSI sequences in regular code blocks", () => {
|
||||
render(
|
||||
<ThemeProvider theme="light">
|
||||
<CodeBlock
|
||||
language="text"
|
||||
code={"\x1b[38;2;35;209;139mtruecolor\x1b[0m"}
|
||||
/>
|
||||
</ThemeProvider>,
|
||||
);
|
||||
|
||||
expect(screen.queryByTestId("highlighted-code")).not.toBeInTheDocument();
|
||||
expect(screen.getByText("truecolor")).toHaveStyle({
|
||||
color: "rgb(35, 209, 139)",
|
||||
});
|
||||
});
|
||||
|
||||
it("copies ANSI output as clean text", async () => {
|
||||
const user = userEvent.setup();
|
||||
const writeText = vi.fn().mockResolvedValue(undefined);
|
||||
Object.defineProperty(navigator, "clipboard", {
|
||||
configurable: true,
|
||||
value: { writeText },
|
||||
});
|
||||
|
||||
try {
|
||||
render(
|
||||
<ThemeProvider theme="dark">
|
||||
<CodeBlock language="ansi" code={"\x1b[32mPASS\x1b[0m"} />
|
||||
</ThemeProvider>,
|
||||
);
|
||||
|
||||
await user.click(screen.getByRole("button", { name: /copy/i }));
|
||||
|
||||
expect(writeText).toHaveBeenCalledWith("PASS");
|
||||
} finally {
|
||||
Reflect.deleteProperty(navigator, "clipboard");
|
||||
}
|
||||
});
|
||||
|
||||
it("reads theme from context without creating per-block observers", async () => {
|
||||
const originalMutationObserver = globalThis.MutationObserver;
|
||||
const observer = vi.fn();
|
||||
|
||||
@ -412,6 +412,61 @@ describe("NanobotClient", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("sends transcription requests and resolves transcription results outside chat dispatch", async () => {
|
||||
const client = new NanobotClient({
|
||||
url: "ws://test",
|
||||
reconnect: false,
|
||||
socketFactory: (url) => new FakeSocket(url) as unknown as WebSocket,
|
||||
});
|
||||
const handler = vi.fn();
|
||||
client.onChat("chat-a", handler);
|
||||
client.connect();
|
||||
lastSocket().fakeOpen();
|
||||
|
||||
const promise = client.transcribeAudio("data:audio/webm;base64,AAAA", {
|
||||
durationMs: 1234,
|
||||
timeoutMs: 1_000,
|
||||
});
|
||||
const frame = JSON.parse(lastSocket().sent.at(-1) as string);
|
||||
expect(frame).toMatchObject({
|
||||
type: "transcribe_audio",
|
||||
data_url: "data:audio/webm;base64,AAAA",
|
||||
duration_ms: 1234,
|
||||
});
|
||||
expect(typeof frame.request_id).toBe("string");
|
||||
|
||||
lastSocket().fakeMessage({
|
||||
event: "transcription_result",
|
||||
request_id: frame.request_id,
|
||||
text: "hello from voice",
|
||||
});
|
||||
await expect(promise).resolves.toBe("hello from voice");
|
||||
expect(handler).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("rejects pending transcription requests on server errors and socket close", async () => {
|
||||
const client = new NanobotClient({
|
||||
url: "ws://test",
|
||||
reconnect: false,
|
||||
socketFactory: (url) => new FakeSocket(url) as unknown as WebSocket,
|
||||
});
|
||||
client.connect();
|
||||
lastSocket().fakeOpen();
|
||||
|
||||
const errored = client.transcribeAudio("data:audio/webm;base64,AAAA", { timeoutMs: 1_000 });
|
||||
const errorFrame = JSON.parse(lastSocket().sent.at(-1) as string);
|
||||
lastSocket().fakeMessage({
|
||||
event: "transcription_error",
|
||||
request_id: errorFrame.request_id,
|
||||
detail: "not_configured",
|
||||
});
|
||||
await expect(errored).rejects.toThrow("not_configured");
|
||||
|
||||
const dropped = client.transcribeAudio("data:audio/webm;base64,BBBB", { timeoutMs: 1_000 });
|
||||
lastSocket().close();
|
||||
await expect(dropped).rejects.toThrow("socket closed");
|
||||
});
|
||||
|
||||
it("queues sends while connecting and flushes on open", () => {
|
||||
const client = new NanobotClient({
|
||||
url: "ws://test",
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { fireEvent, render, screen, waitFor, within } from "@testing-library/react";
|
||||
import { act, fireEvent, render, screen, waitFor, within } from "@testing-library/react";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
import { ThreadComposer } from "@/components/thread/ThreadComposer";
|
||||
@ -121,6 +121,7 @@ const MCP_PRESETS: McpPresetInfo[] = [
|
||||
},
|
||||
];
|
||||
const ORIGINAL_INNER_HEIGHT = window.innerHeight;
|
||||
const ORIGINAL_MEDIA_DEVICES = navigator.mediaDevices;
|
||||
|
||||
function mockBlobUrls() {
|
||||
Object.defineProperty(URL, "createObjectURL", {
|
||||
@ -135,7 +136,16 @@ function mockBlobUrls() {
|
||||
|
||||
afterEach(() => {
|
||||
vi.restoreAllMocks();
|
||||
vi.unstubAllGlobals();
|
||||
Reflect.deleteProperty(window, "nanobotHost");
|
||||
if (ORIGINAL_MEDIA_DEVICES) {
|
||||
Object.defineProperty(navigator, "mediaDevices", {
|
||||
configurable: true,
|
||||
value: ORIGINAL_MEDIA_DEVICES,
|
||||
});
|
||||
} else {
|
||||
Reflect.deleteProperty(navigator, "mediaDevices");
|
||||
}
|
||||
window.localStorage.clear();
|
||||
Object.defineProperty(window, "innerHeight", {
|
||||
value: ORIGINAL_INNER_HEIGHT,
|
||||
@ -161,6 +171,75 @@ function rect(init: Partial<DOMRect>): DOMRect {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Install fakes for `navigator.mediaDevices.getUserMedia` and the global
 * `MediaRecorder` so voice recording can run inside the test environment.
 *
 * The fake recorder emits `blob` through `ondataavailable` and then fires
 * `onstop` synchronously when `stop()` is called.
 *
 * @param blob Recording payload handed to the component under test.
 * @returns The `getUserMedia` mock and the `stop` mock of the fake track.
 */
function mockVoiceRecorder(blob = new Blob(["voice"], { type: "audio/webm" })) {
  class FakeMediaRecorder {
    // Only plain WebM audio is reported as supported.
    static isTypeSupported = vi.fn((type: string) => type === "audio/webm");

    state: RecordingState = "inactive";
    mimeType = blob.type;
    ondataavailable: ((event: BlobEvent) => void) | null = null;
    onstop: (() => void) | null = null;

    start() {
      this.state = "recording";
    }

    stop() {
      this.state = "inactive";
      const chunk = { data: blob } as BlobEvent;
      if (this.ondataavailable) {
        this.ondataavailable(chunk);
      }
      if (this.onstop) {
        this.onstop();
      }
    }
  }

  const stopTrack = vi.fn();
  const getUserMedia = vi.fn(async () => ({
    getTracks: () => [{ stop: stopTrack }],
  }));

  Object.defineProperty(navigator, "mediaDevices", {
    configurable: true,
    value: { getUserMedia },
  });
  vi.stubGlobal("MediaRecorder", FakeMediaRecorder);

  return { getUserMedia, stopTrack };
}
|
||||
|
||||
/**
 * Stub the global `AudioContext` and the animation-frame functions so the
 * microphone level meter can be exercised in tests.
 *
 * The fake analyser fills every time-domain buffer with `sample`, and
 * animation frames are driven by 16 ms timers.
 *
 * @param sample Byte written to each slot of the analyser buffer.
 * @param state  Value reported by the fake context's `state` property.
 */
function mockVoiceAudioInput(sample = 128, state: AudioContextState = "running") {
  const contextState = state;

  class FakeAudioContext {
    state = contextState;
    close = vi.fn(async () => undefined);
    resume = vi.fn(async () => undefined);

    createMediaStreamSource() {
      return { connect: vi.fn(), disconnect: vi.fn() };
    }

    createAnalyser() {
      const fillWithSample = (data: Uint8Array) => data.fill(sample);
      return {
        fftSize: 256,
        smoothingTimeConstant: 0,
        disconnect: vi.fn(),
        getByteTimeDomainData: fillWithSample,
      };
    }
  }

  vi.stubGlobal("AudioContext", FakeAudioContext);

  vi.spyOn(window, "requestAnimationFrame").mockImplementation((callback) => {
    const handle = window.setTimeout(() => callback(performance.now()), 16);
    return handle as unknown as number;
  });
  vi.spyOn(window, "cancelAnimationFrame").mockImplementation((id) => {
    window.clearTimeout(id as unknown as number);
  });
}
|
||||
|
||||
/**
 * Let 700 ms of real time pass inside `act`, so a recording in progress has
 * time to accumulate before the test stops it.
 */
async function waitForVoiceCapture(): Promise<void> {
  const pause = () => new Promise((resolve) => setTimeout(resolve, 700));
  await act(async () => {
    await pause();
  });
}
|
||||
|
||||
describe("ThreadComposer", () => {
|
||||
it("renders a readonly hero model composer when provided", () => {
|
||||
render(
|
||||
@ -209,6 +288,245 @@ describe("ThreadComposer", () => {
|
||||
expect(screen.queryByText(/Enter to send/)).not.toBeInTheDocument();
|
||||
});
|
||||
|
||||
it("transcribes voice input into the composer without sending", async () => {
|
||||
mockVoiceRecorder();
|
||||
const onSend = vi.fn();
|
||||
const onTranscribeAudio = vi.fn(async () => "hello voice");
|
||||
render(
|
||||
<ThreadComposer
|
||||
onSend={onSend}
|
||||
onTranscribeAudio={onTranscribeAudio}
|
||||
placeholder="Type your message..."
|
||||
/>,
|
||||
);
|
||||
|
||||
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
|
||||
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
|
||||
await waitForVoiceCapture();
|
||||
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
|
||||
|
||||
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledWith(
|
||||
expect.stringMatching(/^data:audio\/webm;base64,/),
|
||||
expect.objectContaining({ durationMs: expect.any(Number) }),
|
||||
));
|
||||
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("hello voice"));
|
||||
expect(onSend).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("does not start duplicate voice recordings while microphone access is pending", async () => {
|
||||
const { getUserMedia, stopTrack } = mockVoiceRecorder();
|
||||
let resolveStream: ((stream: MediaStream) => void) | undefined;
|
||||
getUserMedia.mockImplementation(() => new Promise((resolve) => {
|
||||
resolveStream = resolve as (stream: MediaStream) => void;
|
||||
}));
|
||||
const onTranscribeAudio = vi.fn(async () => "one recording");
|
||||
render(
|
||||
<ThreadComposer
|
||||
onSend={vi.fn()}
|
||||
onTranscribeAudio={onTranscribeAudio}
|
||||
placeholder="Type your message..."
|
||||
/>,
|
||||
);
|
||||
|
||||
const voiceButton = screen.getByRole("button", { name: "Voice input" });
|
||||
fireEvent.click(voiceButton);
|
||||
fireEvent.click(voiceButton);
|
||||
|
||||
expect(getUserMedia).toHaveBeenCalledTimes(1);
|
||||
|
||||
await act(async () => {
|
||||
resolveStream?.({ getTracks: () => [{ stop: stopTrack }] } as unknown as MediaStream);
|
||||
});
|
||||
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
|
||||
await waitForVoiceCapture();
|
||||
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
|
||||
|
||||
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledTimes(1));
|
||||
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("one recording"));
|
||||
});
|
||||
|
||||
it("supports press-and-hold voice recording", async () => {
|
||||
mockVoiceRecorder();
|
||||
const onSend = vi.fn();
|
||||
const onTranscribeAudio = vi.fn(async () => "held voice");
|
||||
render(
|
||||
<ThreadComposer
|
||||
onSend={onSend}
|
||||
onTranscribeAudio={onTranscribeAudio}
|
||||
placeholder="Type your message..."
|
||||
/>,
|
||||
);
|
||||
|
||||
const voiceButton = screen.getByRole("button", { name: "Voice input" });
|
||||
fireEvent.pointerDown(voiceButton, { button: 0, pointerId: 1, pointerType: "touch" });
|
||||
await act(async () => {
|
||||
await new Promise((resolve) => setTimeout(resolve, 180));
|
||||
});
|
||||
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
|
||||
await waitForVoiceCapture();
|
||||
fireEvent.pointerUp(screen.getByRole("button", { name: "Stop recording" }), {
|
||||
pointerId: 1,
|
||||
pointerType: "touch",
|
||||
});
|
||||
|
||||
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalled());
|
||||
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("held voice"));
|
||||
expect(onSend).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("supports keyboard hold voice recording", async () => {
|
||||
mockVoiceRecorder();
|
||||
const onSend = vi.fn();
|
||||
const onTranscribeAudio = vi.fn(async () => "shortcut voice");
|
||||
render(
|
||||
<ThreadComposer
|
||||
onSend={onSend}
|
||||
onTranscribeAudio={onTranscribeAudio}
|
||||
placeholder="Type your message..."
|
||||
/>,
|
||||
);
|
||||
|
||||
const voiceButton = screen.getByRole("button", { name: "Voice input" });
|
||||
expect(voiceButton).toHaveAttribute("title", "Click to dictate or hold");
|
||||
expect(voiceButton).toHaveAttribute("aria-keyshortcuts", "Control+Shift+D");
|
||||
fireEvent.keyDown(window, { code: "KeyD", ctrlKey: true, key: "D", shiftKey: true });
|
||||
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
|
||||
await waitForVoiceCapture();
|
||||
fireEvent.keyUp(window, { code: "KeyD", ctrlKey: true, key: "D", shiftKey: true });
|
||||
|
||||
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalled());
|
||||
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("shortcut voice"));
|
||||
expect(onSend).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("ignores the delayed click emitted after a long-press voice recording", async () => {
|
||||
const { getUserMedia } = mockVoiceRecorder();
|
||||
const onTranscribeAudio = vi.fn(async () => "held once");
|
||||
render(
|
||||
<ThreadComposer
|
||||
onSend={vi.fn()}
|
||||
onTranscribeAudio={onTranscribeAudio}
|
||||
placeholder="Type your message..."
|
||||
/>,
|
||||
);
|
||||
|
||||
const voiceButton = screen.getByRole("button", { name: "Voice input" });
|
||||
fireEvent.pointerDown(voiceButton, { button: 0, pointerId: 1, pointerType: "touch" });
|
||||
await act(async () => {
|
||||
await new Promise((resolve) => setTimeout(resolve, 180));
|
||||
});
|
||||
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
|
||||
await waitForVoiceCapture();
|
||||
fireEvent.pointerUp(screen.getByRole("button", { name: "Stop recording" }), {
|
||||
pointerId: 1,
|
||||
pointerType: "touch",
|
||||
});
|
||||
await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("held once"));
|
||||
|
||||
await act(async () => {
|
||||
await new Promise((resolve) => setTimeout(resolve, 20));
|
||||
});
|
||||
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
|
||||
|
||||
expect(getUserMedia).toHaveBeenCalledTimes(1);
|
||||
expect(onTranscribeAudio).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("keeps existing text when voice transcription fails", async () => {
|
||||
mockVoiceRecorder();
|
||||
const onSend = vi.fn();
|
||||
const onTranscribeAudio = vi.fn(async () => {
|
||||
throw new Error("not_configured");
|
||||
});
|
||||
render(
|
||||
<ThreadComposer
|
||||
onSend={onSend}
|
||||
onTranscribeAudio={onTranscribeAudio}
|
||||
placeholder="Type your message..."
|
||||
/>,
|
||||
);
|
||||
|
||||
const input = screen.getByLabelText("Message input");
|
||||
fireEvent.change(input, { target: { value: "draft" } });
|
||||
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
|
||||
await waitForVoiceCapture();
|
||||
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
|
||||
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText("Configure a transcription provider first.")).toBeInTheDocument();
|
||||
});
|
||||
expect(input).toHaveValue("draft");
|
||||
expect(onSend).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("does not transcribe recordings that are too short", async () => {
|
||||
mockVoiceRecorder();
|
||||
const onTranscribeAudio = vi.fn(async () => "should not appear");
|
||||
render(
|
||||
<ThreadComposer
|
||||
onSend={vi.fn()}
|
||||
onTranscribeAudio={onTranscribeAudio}
|
||||
placeholder="Type your message..."
|
||||
/>,
|
||||
);
|
||||
|
||||
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
|
||||
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
|
||||
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText("Hold a little longer to record voice.")).toBeInTheDocument();
|
||||
});
|
||||
expect(onTranscribeAudio).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("warns during recording when microphone input is silent", async () => {
|
||||
mockVoiceRecorder();
|
||||
mockVoiceAudioInput();
|
||||
const onTranscribeAudio = vi.fn(async () => "should not appear");
|
||||
render(
|
||||
<ThreadComposer
|
||||
onSend={vi.fn()}
|
||||
onTranscribeAudio={onTranscribeAudio}
|
||||
placeholder="Type your message..."
|
||||
/>,
|
||||
);
|
||||
|
||||
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
|
||||
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
|
||||
await act(async () => {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1_150));
|
||||
});
|
||||
|
||||
expect(screen.getByText("No microphone input detected.")).toBeInTheDocument();
|
||||
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
|
||||
expect(onTranscribeAudio).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("does not treat unavailable microphone levels as silence", async () => {
|
||||
mockVoiceRecorder();
|
||||
mockVoiceAudioInput(128, "suspended");
|
||||
const onTranscribeAudio = vi.fn(async () => "voice text");
|
||||
render(
|
||||
<ThreadComposer
|
||||
onSend={vi.fn()}
|
||||
onTranscribeAudio={onTranscribeAudio}
|
||||
placeholder="Type your message..."
|
||||
/>,
|
||||
);
|
||||
|
||||
fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
|
||||
expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
|
||||
await act(async () => {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1_150));
|
||||
});
|
||||
|
||||
expect(screen.queryByText("No microphone input detected.")).not.toBeInTheDocument();
|
||||
fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
|
||||
|
||||
await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledTimes(1));
|
||||
expect(screen.getByDisplayValue("voice text")).toBeInTheDocument();
|
||||
});
|
||||
|
||||
it("renders and changes workspace access mode", async () => {
|
||||
const onWorkspaceScopeChange = vi.fn();
|
||||
render(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user