feat(transcription): add shared voice input support (#4232)

* feat(webui): add voice transcription input
* feat(webui): render ANSI output in code blocks
* refactor(webui): isolate voice recorder logic
* refactor(transcription): keep websocket ingress thin
* refactor(transcription): resolve channel audio settings on demand
* style(webui): neutralize voice waveform color
* feat(webui): add voice input tooltip
* feat(webui): add voice input keyboard shortcut
* fix(webui): distinguish voice shortcut platforms
* fix(webui): place voice button after model selector
* refactor(webui): share voice hold recording helpers
* fix(desktop): allow microphone voice input
* fix(webui): stabilize token usage month labels
* feat(webui): show voice input on settings overview
* fix(webui): label voice capability as recognition
* fix(webui): align capability overview status
* refactor(webui): isolate transcription socket handling
* fix(webui): soften silent voice waveform
* refactor(audio): clarify transcription service location
* docs(transcription): clarify audio and provider boundaries
* fix(exec): reduce session output polling flake
2026-06-15 07:14:08 +00:00 · 2026-06-09 01:08:49 +08:00 · 2026-06-09 01:08:49 +08:00 · 9c81280300
commit 9c81280300
parent 06d454a225
49 changed files with 3071 additions and 257 deletions
--- a/desktop/package.json
+++ b/desktop/package.json
@ -47,6 +47,9 @@
    ],
    "mac": {
      "category": "public.app-category.developer-tools",
+      "extendInfo": {
+        "NSMicrophoneUsageDescription": "nanobot uses the microphone to transcribe voice input before you send messages."
+      },
      "target": [
        "dmg"
      ]
--- a/desktop/src/main.ts
+++ b/desktop/src/main.ts
@ -15,6 +15,7 @@ import {
  protocol,
  session,
  shell,
+  systemPreferences,
 } from "electron";
 import type { IpcMainInvokeEvent, WebContents } from "electron";

@ -100,6 +101,58 @@ function isTrustedAppUrl(rawUrl: string): boolean {
  }
 }

+/**
+ * Reports whether a permission check/request can be attributed to the app's own UI.
+ *
+ * Candidate URLs are the `requestingUrl` and `securityOrigin` entries of
+ * `details` plus the URL currently loaded in `webContents` (when one is
+ * given). The request counts as trusted as soon as any candidate is a string
+ * accepted by `isTrustedAppUrl`; non-string candidates are ignored.
+ */
+function isTrustedPermissionRequest(
+  webContents: WebContents | null,
+  details: unknown,
+): boolean {
+  // NOTE(review): any-match means a request whose requesting URL is untrusted
+  // still passes when the hosting webContents URL is trusted — confirm intended.
+  const candidates: unknown[] = [
+    permissionDetail(details, "requestingUrl"),
+    permissionDetail(details, "securityOrigin"),
+    webContents?.getURL(),
+  ];
+  for (const candidate of candidates) {
+    if (typeof candidate === "string" && isTrustedAppUrl(candidate)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Reads `key` from `details` without assuming its shape.
+ *
+ * Returns `undefined` when `details` is not a non-null object; otherwise
+ * returns whatever value is stored under `key` (possibly `undefined`).
+ */
+function permissionDetail(details: unknown, key: string): unknown {
+  if (details === null || typeof details !== "object") {
+    return undefined;
+  }
+  const record = details as Record<string, unknown>;
+  return record[key];
+}
+
+/**
+ * Reports whether a media permission check/request asks for audio and nothing else.
+ *
+ * When `details.mediaTypes` is an array it must list "audio" and must not
+ * list "video". Otherwise the singular `details.mediaType` must equal "audio".
+ */
+function isAudioOnlyMediaRequest(details: unknown): boolean {
+  const requested = permissionDetail(details, "mediaTypes");
+  if (!Array.isArray(requested)) {
+    return permissionDetail(details, "mediaType") === "audio";
+  }
+  if (!requested.includes("audio")) {
+    return false;
+  }
+  return !requested.includes("video");
+}
+
+/**
+ * Resolves the operating-system level microphone permission.
+ *
+ * On every platform except macOS (`darwin`) this resolves to `true` without
+ * consulting the OS. On macOS an already "granted" status resolves to `true`,
+ * "denied" or "restricted" resolves to `false`, and any other status asks
+ * the user through `systemPreferences.askForMediaAccess`.
+ */
+async function requestNativeMicrophoneAccess(): Promise<boolean> {
+  if (process.platform !== "darwin") {
+    return true;
+  }
+  switch (systemPreferences.getMediaAccessStatus("microphone")) {
+    case "granted":
+      return true;
+    case "denied":
+    case "restricted":
+      return false;
+    default: {
+      const granted = await systemPreferences.askForMediaAccess("microphone");
+      return granted;
+    }
+  }
+}
+
+/**
+ * Installs the permission check and request handlers on the default session.
+ *
+ * Both handlers allow only `media` permissions that come from a trusted app
+ * URL and ask for audio without video; everything else is refused. An allowed
+ * request is additionally gated on `requestNativeMicrophoneAccess`, and a
+ * rejection of that promise is reported to the caller as a denial.
+ */
+function registerPermissionHandlers(): void {
+  // Single predicate shared by both handlers so they cannot drift apart.
+  const allowsMicrophone = (
+    webContents: WebContents | null,
+    permission: string,
+    details: unknown,
+  ): boolean =>
+    permission === "media"
+    && isTrustedPermissionRequest(webContents, details)
+    && isAudioOnlyMediaRequest(details);
+
+  const { defaultSession } = session;
+  defaultSession.setPermissionCheckHandler(
+    (webContents, permission, _origin, details) =>
+      allowsMicrophone(webContents, permission, details),
+  );
+  defaultSession.setPermissionRequestHandler((webContents, permission, callback, details) => {
+    if (!allowsMicrophone(webContents, permission, details)) {
+      callback(false);
+      return;
+    }
+    // Fire-and-forget: the outcome is delivered through `callback`.
+    void requestNativeMicrophoneAccess().then(
+      (granted) => callback(granted),
+      () => callback(false),
+    );
+  });
+}
+
 function assertTrustedIpc(event: IpcMainInvokeEvent): void {
  const frameUrl = event.senderFrame?.url || event.sender.getURL();
  if (!isTrustedAppUrl(frameUrl)) {
@ -749,6 +802,7 @@ app.whenReady().then(async () => {
  }

  registerIpcHandlers();
+  registerPermissionHandlers();
  registerAppProtocol(webDist, devUrl);

  mainWindow = createWindow();
--- a/docs/channel-plugin-guide.md
+++ b/docs/channel-plugin-guide.md
@ -234,7 +234,7 @@ nanobot channels login <channel_name> --force  # re-authenticate
 | `_handle_message(sender_id, chat_id, content, media?, metadata?, session_key?)` | **Call this when you receive a message.** Checks `is_allowed()`, then publishes to the bus. Automatically sets `_wants_stream` if `supports_streaming` is true. |
 | `is_allowed(sender_id)` | Checks against `config.allow_from`; `"*"` allows all, `[]` denies all. |
 | `default_config()` (classmethod) | Returns default config dict for `nanobot onboard`. Override to declare your fields. |
-| `transcribe_audio(file_path)` | Transcribes audio via Groq Whisper (if configured). |
+| `transcribe_audio(file_path)` | Transcribes audio via the shared top-level `transcription` config (if configured). |
 | `supports_streaming` (property) | `True` when config has `"streaming": true` **and** subclass overrides `send_delta()`. |
 | `is_running` | Returns `self._running`. |
 | `login(force=False)` | Perform interactive login (e.g. QR code scan). Returns `True` if already authenticated or login succeeds. Override in subclasses that support interactive login. |
--- a/docs/configuration.md
+++ b/docs/configuration.md
@ -119,7 +119,7 @@ ANTHROPIC_API_KEY="$(bw get password api/anthropic)" nanobot agent
 ## Providers

 > [!TIP]
-> - **Voice transcription**: Voice messages (Telegram, WhatsApp) are automatically transcribed using Whisper. By default Groq is used (free tier). Set `"transcriptionProvider": "openai"` under `channels` to use OpenAI Whisper instead, and optionally set `"transcriptionLanguage": "en"` (or another ISO-639-1 code) for more accurate transcription. The API key is picked from the matching provider config.
+> - **Voice transcription**: Voice messages and WebUI/desktop microphone input use the shared top-level `transcription` settings. By default Groq Whisper is used; set `transcription.provider` to `"openai"` to use OpenAI Whisper. API keys still live in the matching `providers.<provider>` config.
 > - **MiniMax Coding Plan**: Exclusive discount links for the nanobot community: [Overseas](https://platform.minimax.io/subscribe/coding-plan?code=9txpdXw04g&source=link) · [Mainland China](https://platform.minimaxi.com/subscribe/token-plan?code=GILTJpMTqZ&source=link)
 > - **MiniMax (Mainland China)**: If your API key is from MiniMax's mainland China platform (minimaxi.com), set `"apiBase": "https://api.minimaxi.com/v1"` in your minimax provider config.
 > - **MiniMax thinking mode**: Use `providers.minimaxAnthropic` when you want `reasoningEffort` / thinking mode. MiniMax exposes that capability through its Anthropic-compatible endpoint, so nanobot keeps it as a separate provider instead of guessing MiniMax-specific thinking parameters on the generic OpenAI-compatible `minimax` endpoint. It uses the same `MINIMAX_API_KEY`. Default Anthropic-compatible base URL: `https://api.minimax.io/anthropic`; for mainland China use `https://api.minimaxi.com/anthropic`.
@ -1100,6 +1100,61 @@ Set `agents.defaults.modelPreset` to start with a named preset:

 When `modelPreset` is `null` or omitted, startup uses the implicit `default` preset from `agents.defaults.*`. Runtime changes made with `/model <preset>` are not written back to `config.json`; they affect future turns until the process restarts or another model/config change replaces them.

+## Transcription Settings
+
+Audio transcription is a shared capability used by chat-channel voice messages and by WebUI/desktop microphone input. Chat-channel voice messages are transcribed automatically before they enter the agent. WebUI and desktop microphone input is transcribed into the composer first, so you can edit the text before sending.
+
+Configure transcription under the top-level `transcription` section:
+
+```json
+{
+  "transcription": {
+    "enabled": true,
+    "provider": "groq",
+    "model": null,
+    "language": null,
+    "maxDurationSec": 120,
+    "maxUploadMb": 25
+  }
+}
+```
+
+| Setting | Default | Description |
+|---------|---------|-------------|
+| `enabled` | `true` | Enables audio transcription for both chat-channel voice messages and WebUI/desktop microphone input. |
+| `provider` | `"groq"` | Transcription backend: `"groq"` or `"openai"`. |
+| `model` | provider default | Optional transcription model override. Defaults to `whisper-large-v3` for Groq and `whisper-1` for OpenAI. |
+| `language` | `null` | Optional ISO-639 language hint, e.g. `"en"`, `"zh"`, `"ko"`, or `"ja"`. |
+| `maxDurationSec` | `120` | Maximum WebUI/desktop recording duration, in seconds (allowed range: 1–600). |
+| `maxUploadMb` | `25` | Maximum WebUI/desktop audio upload size, in megabytes (allowed range: 1–100). |
+
+Provider and language resolution is intentionally ordered for backwards compatibility:
+
+1. `transcription.provider` / `transcription.language`
+2. Legacy `channels.transcriptionProvider` / `channels.transcriptionLanguage`
+3. Built-in defaults (`provider: "groq"`, no language hint)
+
+The legacy `channels.*` transcription fields existed before transcription became a shared capability across chat channels and WebUI/desktop microphone input. They are still read so older `config.json` files keep working, but they are no longer the preferred configuration surface. If both old and new fields are present, the top-level `transcription` values are the source of truth.
+
+Transcription credentials are intentionally not stored in `transcription`. Put the API key and optional endpoint in the matching provider config:
+
+```json
+{
+  "providers": {
+    "groq": {
+      "apiKey": "gsk-...",
+      "apiBase": "https://api.groq.com/openai/v1"
+    }
+  },
+  "transcription": {
+    "provider": "groq",
+    "language": "zh"
+  }
+}
+```
+
+Selecting a transcription provider does not configure credentials by itself. For example, the effective provider may default to Groq for compatibility, but transcription is only usable when `providers.groq.apiKey` or the matching environment-backed config is available. The Settings UI writes only the top-level `transcription` fields.
+
 ## Channel Settings

 Global settings that apply to all channels. Configure under the `channels` section in `~/.nanobot/config.json`:
@ -1111,8 +1166,6 @@ Global settings that apply to all channels. Configure under the `channels` secti
    "sendToolHints": false,
    "extractDocumentText": true,
    "sendMaxRetries": 3,
-    "transcriptionProvider": "groq",
-    "transcriptionLanguage": null,
    "telegram": { ... }
  }
 }
@ -1125,8 +1178,8 @@ Global settings that apply to all channels. Configure under the `channels` secti
 | `showReasoning` | `true` | Allow channels to surface model reasoning/thinking content (DeepSeek-R1 `reasoning_content`, Anthropic `thinking_blocks`, inline `<think>` tags). Reasoning flows as a dedicated stream with `_reasoning_delta` / `_reasoning_end` markers — channels override `send_reasoning_delta` / `send_reasoning_end` to render in-place updates. Even with `true`, channels without those overrides stay no-op silently. Currently surfaced on CLI and WebSocket/WebUI (italic shimmer header, auto-collapses after the stream ends); Telegram / Slack / Discord / Feishu / WeChat / Matrix keep the base no-op until their bubble UI is adapted. Independent of `sendProgress`. |
 | `extractDocumentText` | `true` | Extract supported document/text attachments into the model prompt. Set to `false` to keep document content out of the prompt and include attachment path references instead. |
 | `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
-| `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key and optional `apiBase` are auto-resolved from the matching provider config. Chat-style bases such as `https://api.groq.com/openai/v1` are normalized to the audio transcription endpoint. |
-| `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. |
+
+`channels.transcriptionProvider` and `channels.transcriptionLanguage` are deprecated compatibility fields. They remain as a read-only fallback for older configs, but new configuration should use top-level `transcription.provider` and `transcription.language`.

 `sendProgress` and `sendToolHints` can also be overridden per channel. The
 global values stay as defaults for channels that do not set their own value:
--- a/nanobot/agent/tools/exec_session.py
+++ b/nanobot/agent/tools/exec_session.py
@ -24,6 +24,7 @@ DEFAULT_WAIT_FOR_MS = 10_000
 MAX_WAIT_FOR_MS = 120_000
 DEFAULT_MAX_OUTPUT_CHARS = 10_000
 MAX_OUTPUT_CHARS = 50_000
+OUTPUT_DRAIN_GRACE_S = 0.1


@dataclass(slots=True)
@ -139,6 +140,8 @@ class _ExecSession:
                    asyncio.gather(self._stdout_task, self._stderr_task),
                    timeout=2.0,
                )
+        elif yield_time_ms > 0:
+            await self._wait_for_buffered_output()

        async with self._lock:
            output = "".join(self._chunks)
@ -163,6 +166,14 @@ class _ExecSession:
        with suppress(asyncio.TimeoutError):
            await asyncio.wait_for(self.process.wait(), timeout=5.0)

+    async def _wait_for_buffered_output(self) -> None:
+        deadline = time.monotonic() + OUTPUT_DRAIN_GRACE_S
+        while time.monotonic() < deadline:
+            async with self._lock:
+                if self._chunks:
+                    return
+            await asyncio.sleep(0.01)
+

 class ExecSessionManager:
    def __init__(self, *, max_sessions: int = 8, idle_timeout: int = 1800) -> None:
--- a/nanobot/audio/__init__.py
+++ b/nanobot/audio/__init__.py
@ -0,0 +1,2 @@
+"""Shared audio service helpers."""
+
--- a/nanobot/audio/transcription.py
+++ b/nanobot/audio/transcription.py
@ -0,0 +1,183 @@
+"""Application-level audio transcription service.
+
+This module owns nanobot's transcription behavior: config resolution,
+legacy channel fallback, upload validation, temporary-file handling, and
+dispatch to provider adapters. It deliberately does not know provider-specific
+HTTP details; those live in ``nanobot.providers.transcription``.
+"""
+
+from __future__ import annotations
+
+from contextlib import suppress
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Literal
+
+from loguru import logger
+
+from nanobot.config.paths import get_media_dir
+from nanobot.utils.media_decode import FileSizeExceeded, save_base64_data_url
+
+# Provider identifiers accepted by the transcription service.
+TranscriptionProviderName = Literal["groq", "openai"]
+
+# Provider used when neither the top-level nor the legacy channel setting
+# names a recognized provider.
+_DEFAULT_PROVIDER: TranscriptionProviderName = "groq"
+# Model used per provider when no override is configured. The keys double as
+# the set of recognized provider names (see ``_as_provider``).
+_DEFAULT_MODELS: dict[TranscriptionProviderName, str] = {
+    "groq": "whisper-large-v3",
+    "openai": "whisper-1",
+}
+# Upload cap in bytes applied when the configured megabyte limit is falsy.
+_MAX_AUDIO_BYTES_FALLBACK = 25 * 1024 * 1024
+# MIME types accepted for audio data URLs; anything else is rejected as "mime".
+_AUDIO_MIME_ALLOWED: frozenset[str] = frozenset({
+    "audio/aac",
+    "audio/flac",
+    "audio/m4a",
+    "audio/mp4",
+    "audio/mpeg",
+    "audio/ogg",
+    "audio/wav",
+    "audio/webm",
+    "audio/x-m4a",
+    "audio/x-wav",
+})
+
+
+@dataclass(frozen=True)
+class EffectiveTranscriptionConfig:
+    """Fully resolved transcription settings, ready to hand to a provider adapter.
+
+    Built by ``resolve_transcription_config``; instances are immutable.
+    """
+
+    enabled: bool  # whether transcription is switched on
+    provider: TranscriptionProviderName  # resolved backend name
+    model: str  # model name passed to the provider adapter
+    language: str | None  # optional language hint; None means no hint
+    api_key: str = field(repr=False)  # credential; excluded from repr()
+    api_base: str  # provider endpoint override; empty string when unset
+    max_duration_sec: int  # recording length limit, in seconds
+    max_upload_mb: int  # upload size limit, in megabytes
+
+    @property
+    def configured(self) -> bool:
+        """True when an API key is available for the resolved provider."""
+        return bool(self.api_key)
+
+
+class TranscriptionIngressError(Exception):
+    """Stable transcription upload error surfaced to WebUI clients.
+
+    ``detail`` is a short code string (for example ``"mime"`` or ``"size"``)
+    that is also used as the exception message; ``extra`` holds any additional
+    keyword context supplied by the raiser, such as ``provider``.
+    """
+
+    def __init__(self, detail: str, **extra: Any):
+        super().__init__(detail)
+        # Exposed as attributes alongside the exception message.
+        self.detail = detail
+        self.extra = extra
+
+
+def _as_provider(value: Any) -> TranscriptionProviderName | None:
+    if isinstance(value, str):
+        name = value.strip().lower()
+        if name in _DEFAULT_MODELS:
+            return name  # type: ignore[return-value]
+    return None
+
+
+def _provider_config(config: Any, provider: str) -> Any:
+    return getattr(getattr(config, "providers", None), provider, None)
+
+
+def _extract_data_url_mime(url: str) -> str | None:
+    header, _, _ = url.partition(",")
+    if not header.startswith("data:") or ";base64" not in header:
+        return None
+    return header[5:].split(";", 1)[0].strip().lower() or None
+
+
+def resolve_transcription_config(config: Any) -> EffectiveTranscriptionConfig:
+    """Resolve top-level transcription settings with legacy channel fallback."""
+    top = getattr(config, "transcription", None)
+    channels = getattr(config, "channels", None)
+    provider = (
+        _as_provider(getattr(top, "provider", None))
+        or _as_provider(getattr(channels, "transcription_provider", None))
+        or _DEFAULT_PROVIDER
+    )
+    provider_cfg = _provider_config(config, provider)
+    return EffectiveTranscriptionConfig(
+        enabled=bool(getattr(top, "enabled", True)),
+        provider=provider,
+        model=(getattr(top, "model", None) or _DEFAULT_MODELS[provider]).strip(),
+        language=getattr(top, "language", None) or getattr(channels, "transcription_language", None),
+        api_key=getattr(provider_cfg, "api_key", None) or "",
+        api_base=getattr(provider_cfg, "api_base", None) or "",
+        max_duration_sec=int(getattr(top, "max_duration_sec", 120)),
+        max_upload_mb=int(getattr(top, "max_upload_mb", 25)),
+    )
+
+
+async def transcribe_audio_data_url(
+    data_url: Any,
+    config: EffectiveTranscriptionConfig,
+    *,
+    duration_ms: Any = None,
+) -> str:
+    """Validate, persist, transcribe, and remove a WebUI audio data URL."""
+    if not isinstance(data_url, str) or not data_url:
+        raise TranscriptionIngressError("missing_audio")
+    if not config.enabled:
+        raise TranscriptionIngressError("disabled")
+    if not config.configured:
+        raise TranscriptionIngressError("not_configured", provider=config.provider)
+    if (
+        isinstance(duration_ms, (int, float))
+        and duration_ms > (config.max_duration_sec * 1000 + 1000)
+    ):
+        raise TranscriptionIngressError("duration")
+    if _extract_data_url_mime(data_url) not in _AUDIO_MIME_ALLOWED:
+        raise TranscriptionIngressError("mime")
+
+    audio_path: str | None = None
+    max_bytes = max(
+        1,
+        config.max_upload_mb * 1024 * 1024 if config.max_upload_mb else _MAX_AUDIO_BYTES_FALLBACK,
+    )
+    try:
+        audio_path = save_base64_data_url(
+            data_url,
+            get_media_dir("webui-transcription"),
+            max_bytes=max_bytes,
+        )
+    except FileSizeExceeded as exc:
+        raise TranscriptionIngressError("size") from exc
+    except Exception as exc:
+        logger.warning("transcription audio decode failed: {}", exc)
+    if not audio_path:
+        raise TranscriptionIngressError("decode")
+
+    try:
+        text = await transcribe_audio_file(audio_path, config)
+    finally:
+        with suppress(OSError):
+            Path(audio_path).unlink(missing_ok=True)
+    if not text:
+        raise TranscriptionIngressError("empty")
+    return text
+
+
+async def transcribe_audio_file(
+    file_path: str | Path,
+    config: EffectiveTranscriptionConfig,
+) -> str:
+    """Transcribe *file_path* using the already-resolved transcription config."""
+    if not config.enabled or not config.configured:
+        return ""
+    if config.provider == "openai":
+        from nanobot.providers.transcription import OpenAITranscriptionProvider
+
+        provider = OpenAITranscriptionProvider(
+            api_key=config.api_key,
+            api_base=config.api_base or None,
+            language=config.language,
+            model=config.model,
+        )
+    else:
+        from nanobot.providers.transcription import GroqTranscriptionProvider
+
+        provider = GroqTranscriptionProvider(
+            api_key=config.api_key,
+            api_base=config.api_base or None,
+            language=config.language,
+            model=config.model,
+        )
+    return await provider.transcribe(file_path)
--- a/nanobot/channels/base.py
+++ b/nanobot/channels/base.py
@ -28,10 +28,6 @@ class BaseChannel(ABC):

    name: str = "base"
    display_name: str = "Base"
-    transcription_provider: str = "groq"
-    transcription_api_key: str = ""
-    transcription_api_base: str = ""
-    transcription_language: str | None = None
    send_progress: bool = True
    send_tool_hints: bool = False
    show_reasoning: bool = True
@ -51,24 +47,14 @@ class BaseChannel(ABC):

    async def transcribe_audio(self, file_path: str | Path) -> str:
        """Transcribe an audio file via Whisper (OpenAI or Groq). Returns empty string on failure."""
-        if not self.transcription_api_key:
-            return ""
        try:
-            if self.transcription_provider == "openai":
-                from nanobot.providers.transcription import OpenAITranscriptionProvider
-                provider = OpenAITranscriptionProvider(
-                    api_key=self.transcription_api_key,
-                    api_base=self.transcription_api_base or None,
-                    language=self.transcription_language or None,
-                )
-            else:
-                from nanobot.providers.transcription import GroqTranscriptionProvider
-                provider = GroqTranscriptionProvider(
-                    api_key=self.transcription_api_key,
-                    api_base=self.transcription_api_base or None,
-                    language=self.transcription_language or None,
-                )
-            return await provider.transcribe(file_path)
+            from nanobot.audio.transcription import (
+                resolve_transcription_config,
+                transcribe_audio_file,
+            )
+            from nanobot.config.loader import load_config
+
+            return await transcribe_audio_file(file_path, resolve_transcription_config(load_config()))
        except Exception:
            self.logger.exception("Audio transcription failed")
            return ""
--- a/nanobot/channels/manager.py
+++ b/nanobot/channels/manager.py
@ -80,11 +80,6 @@ class ChannelManager:
        """Initialize channels discovered via pkgutil scan + entry_points plugins."""
        from nanobot.channels.registry import discover_channel_names, discover_enabled

-        transcription_provider = self.config.channels.transcription_provider
-        transcription_key = self._resolve_transcription_key(transcription_provider)
-        transcription_base = self._resolve_transcription_base(transcription_provider)
-        transcription_language = self.config.channels.transcription_language
-
        # Collect enabled module names first, then only import those.
        # Channel configs live in ChannelsConfig's extra fields (via
        # extra="allow"), so we enumerate candidates from pkgutil scan
@ -135,10 +130,6 @@ class ChannelManager:
                    )
                    kwargs["gateway"] = gateway
                channel = cls(section, self.bus, **kwargs)
-                channel.transcription_provider = transcription_provider
-                channel.transcription_api_key = transcription_key
-                channel.transcription_api_base = transcription_base
-                channel.transcription_language = transcription_language
                channel.send_progress = self._resolve_bool_override(
                    section, "send_progress", self.config.channels.send_progress,
                )
@ -155,24 +146,6 @@ class ChannelManager:

        self._validate_allow_from()

-    def _resolve_transcription_key(self, provider: str) -> str:
-        """Pick the API key for the configured transcription provider."""
-        try:
-            if provider == "openai":
-                return self.config.providers.openai.api_key
-            return self.config.providers.groq.api_key
-        except AttributeError:
-            return ""
-
-    def _resolve_transcription_base(self, provider: str) -> str:
-        """Pick the API base URL for the configured transcription provider."""
-        try:
-            if provider == "openai":
-                return self.config.providers.openai.api_base or ""
-            return self.config.providers.groq.api_base or ""
-        except AttributeError:
-            return ""
-
    def _validate_allow_from(self) -> None:
        for name, ch in self.channels.items():
            cfg = ch.config
--- a/nanobot/channels/websocket.py
+++ b/nanobot/channels/websocket.py
@ -45,6 +45,7 @@ from nanobot.webui.http_utils import (
    query_first as _query_first,
 )
 from nanobot.webui.mcp_presets_api import normalize_mcp_preset_mentions
+from nanobot.webui.transcription_ws import webui_transcription_event
 from nanobot.webui.websocket_logging import websockets_server_logger


@ -235,7 +236,7 @@ _VIDEO_MIME_ALLOWED: frozenset[str] = frozenset({

 _UPLOAD_MIME_ALLOWED: frozenset[str] = _IMAGE_MIME_ALLOWED | _VIDEO_MIME_ALLOWED

-_DATA_URL_MIME_RE = re.compile(r"^data:([^;]+);base64,", re.DOTALL)
+_DATA_URL_MIME_RE = re.compile(r"^data:([^;,]+)(?:;[^,]*)*;base64,", re.DOTALL)


 def _extract_data_url_mime(url: str) -> str | None:
@ -419,7 +420,6 @@ class WebSocketChannel(BaseChannel):
        return None

    # -- Server lifecycle and connection ingress ---------------------------
-    # -- Server lifecycle and connection ingress ---------------------------

    async def start(self) -> None:
        from nanobot.utils.logging_bridge import redirect_lib_logging
@ -703,6 +703,10 @@ class WebSocketChannel(BaseChannel):
                workspace_scope=scope.payload(),
            )
            return
+        if t == "transcribe_audio":
+            event, payload = await webui_transcription_event(envelope)
+            await self._send_event(connection, event, **payload)
+            return
        if t == "message":
            cid = envelope.get("chat_id")
            content = envelope.get("content")
--- a/nanobot/config/schema.py
+++ b/nanobot/config/schema.py
@ -39,8 +39,19 @@ class ChannelsConfig(Base):
    show_reasoning: bool = True  # surface model reasoning when channel implements it
    extract_document_text: bool = True  # extract text from document attachments before sending to the model
    send_max_retries: int = Field(default=3, ge=0, le=10)  # Max delivery attempts (initial send included)
-    transcription_provider: str = "groq"  # Voice transcription backend: "groq" or "openai"
-    transcription_language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")  # Optional ISO-639-1 hint for audio transcription
+    transcription_provider: str = "groq"  # Deprecated: use top-level transcription.provider
+    transcription_language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")  # Deprecated: use top-level transcription.language
+
+
+class TranscriptionConfig(Base):
+    """Cross-channel audio transcription configuration."""
+
+    # Master switch for audio transcription.
+    enabled: bool = True
+    # Backend name; ``None`` leaves the choice to the resolver's fallback chain.
+    provider: Literal["groq", "openai"] | None = None
+    # Model override; ``None`` selects the provider's default model.
+    model: str | None = None
+    # Optional language hint: two or three lowercase ASCII letters.
+    language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")
+    # Recording length limit in seconds (1-600).
+    max_duration_sec: int = Field(default=120, ge=1, le=600)
+    # Upload size limit in megabytes (1-100).
+    max_upload_mb: int = Field(default=25, ge=1, le=100)


 class DreamConfig(Base):
@ -167,7 +178,7 @@ class AgentsConfig(Base):
 class ProviderConfig(Base):
    """LLM provider configuration."""

-    api_key: str | None = None
+    api_key: str | None = Field(default=None, repr=False)
    api_base: str | None = None
    api_type: Literal["auto", "chat_completions", "responses"] = "auto"  # Request API surface
    extra_headers: dict[str, str] | None = None  # Custom headers (e.g. APP-Code for AiHubMix)
@ -312,6 +323,7 @@ class Config(BaseSettings):

    agents: AgentsConfig = Field(default_factory=AgentsConfig)
    channels: ChannelsConfig = Field(default_factory=ChannelsConfig)
+    transcription: TranscriptionConfig = Field(default_factory=TranscriptionConfig)
    providers: ProvidersConfig = Field(default_factory=ProvidersConfig)
    api: ApiConfig = Field(default_factory=ApiConfig)
    gateway: GatewayConfig = Field(default_factory=GatewayConfig)
--- a/nanobot/providers/transcription.py
+++ b/nanobot/providers/transcription.py
@ -1,6 +1,12 @@
-"""Voice transcription providers (Groq and OpenAI Whisper)."""
+"""Provider-specific voice transcription adapters.
+
+This module only knows how to call external transcription APIs such as Groq
+and OpenAI Whisper. Product-level config fallback, WebUI upload validation,
+and channel integration live in ``nanobot.audio.transcription``.
+"""

 import asyncio
+import mimetypes
 import os
 from pathlib import Path

@ -8,6 +14,15 @@ import httpx
 from loguru import logger

 _TRANSCRIPTIONS_PATH = "audio/transcriptions"
+# Content types keyed by lower-cased file suffix; consulted before
+# ``mimetypes`` when labelling the uploaded audio part.
+_AUDIO_MIME_OVERRIDES = {
+    ".m4a": "audio/mp4",
+    ".mpga": "audio/mpeg",
+    ".ogg": "audio/ogg",
+    ".opus": "audio/ogg",
+    ".wav": "audio/wav",
+    ".weba": "audio/webm",
+    ".webm": "audio/webm",
+}


 def _resolve_transcription_url(api_base: str | None, default_url: str) -> str:
@ -26,6 +41,14 @@ def _resolve_transcription_url(api_base: str | None, default_url: str) -> str:
    return f"{base}/{_TRANSCRIPTIONS_PATH}"


+def _audio_mime_type(path: Path) -> str:
+    return (
+        _AUDIO_MIME_OVERRIDES.get(path.suffix.lower())
+        or mimetypes.guess_type(path.name)[0]
+        or "application/octet-stream"
+    )
+
+
 # Up to 3 retries (4 attempts total) with exponential backoff on transient
 # failures. Whisper endpoints occasionally return 502/503 under load, and
 # mobile-network transcription callers hit sporadic connect/read errors.
@ -71,7 +94,7 @@ async def _post_transcription_with_retry(
    async with httpx.AsyncClient() as client:
        for attempt in range(_MAX_RETRIES + 1):
            files = {
-                "file": (path.name, data),
+                "file": (path.name, data, _audio_mime_type(path)),
                "model": (None, model),
            }
            if language:
@ -113,6 +136,16 @@ async def _post_transcription_with_retry(

            try:
                response.raise_for_status()
+            except httpx.HTTPStatusError:
+                body = response.text.strip().replace("\n", " ")[:500]
+                logger.error(
+                    "{} transcription HTTP {}{}{}",
+                    provider_label,
+                    response.status_code,
+                    f" {response.reason_phrase}" if response.reason_phrase else "",
+                    f": {body}" if body else "",
+                )
+                return ""
            except Exception as e:
                logger.exception("{} transcription error: {}", provider_label, e)
                return ""
@ -144,6 +177,7 @@ class OpenAITranscriptionProvider:
        api_key: str | None = None,
        api_base: str | None = None,
        language: str | None = None,
+        model: str | None = None,
    ):
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
        self.api_url = _resolve_transcription_url(
@ -151,6 +185,7 @@ class OpenAITranscriptionProvider:
            "https://api.openai.com/v1/audio/transcriptions",
        )
        self.language = language or None
+        self.model = model or "whisper-1"
        logger.debug("OpenAI transcription endpoint: {}", self.api_url)

    async def transcribe(self, file_path: str | Path) -> str:
@ -165,7 +200,7 @@ class OpenAITranscriptionProvider:
            self.api_url,
            api_key=self.api_key,
            path=path,
-            model="whisper-1",
+            model=self.model,
            provider_label="OpenAI",
            language=self.language,
        )
@ -183,6 +218,7 @@ class GroqTranscriptionProvider:
        api_key: str | None = None,
        api_base: str | None = None,
        language: str | None = None,
+        model: str | None = None,
    ):
        self.api_key = api_key or os.environ.get("GROQ_API_KEY")
        self.api_url = _resolve_transcription_url(
@ -190,6 +226,7 @@ class GroqTranscriptionProvider:
            "https://api.groq.com/openai/v1/audio/transcriptions",
        )
        self.language = language or None
+        self.model = model or "whisper-large-v3"
        logger.debug("Groq transcription endpoint: {}", self.api_url)

    async def transcribe(self, file_path: str | Path) -> str:
@ -215,7 +252,7 @@ class GroqTranscriptionProvider:
            self.api_url,
            api_key=self.api_key,
            path=path,
-            model="whisper-large-v3",
+            model=self.model,
            provider_label="Groq",
            language=self.language,
        )
--- a/nanobot/utils/media_decode.py
+++ b/nanobot/utils/media_decode.py
@ -18,13 +18,30 @@ from nanobot.utils.helpers import safe_filename
 DEFAULT_MAX_BYTES = 10 * 1024 * 1024
 MAX_FILE_SIZE = DEFAULT_MAX_BYTES

-_DATA_URL_RE = re.compile(r"^data:([^;]+);base64,(.+)$", re.DOTALL)
+# Matches ``data:<mime>[;param...];base64,<payload>``. Group 1 is the bare MIME
+# type and group 2 is the base64 payload; optional parameters such as
+# ``;codecs=opus`` are skipped. Each parameter segment excludes ``;`` so a run
+# of semicolons can only be split one way. Allowing ``;`` inside a segment
+# accepts the same strings but lets malformed input (many ``;`` and no
+# ``;base64,``) trigger exponential backtracking.
+_DATA_URL_RE = re.compile(r"^data:([^;,]+)(?:;[^;,]*)*;base64,(.+)$", re.DOTALL)
+_MIME_EXTENSION_OVERRIDES = {
+    # Python's ``mimetypes`` maps browser-recorded audio/webm to ``.weba`` and
+    # audio/ogg to ``.oga`` on macOS. Some transcription APIs validate by the
+    # file extension and accept the canonical container extensions instead.
+    "application/ogg": ".ogg",
+    "audio/ogg": ".ogg",
+    "audio/mpga": ".mpga",
+    "audio/wav": ".wav",
+    "audio/webm": ".webm",
+    "audio/x-m4a": ".m4a",
+    "audio/x-wav": ".wav",
+    "audio/vnd.wave": ".wav",
+    "video/webm": ".webm",
+}


-class FileSizeExceeded(Exception):
+class FileSizeExceededError(Exception):
    """Raised when a decoded payload exceeds the caller's size limit."""


+FileSizeExceeded = FileSizeExceededError
+
+
 def save_base64_data_url(
    data_url: str,
    media_dir: Path,
@ -40,7 +57,7 @@ def save_base64_data_url(
    m = _DATA_URL_RE.match(data_url)
    if not m:
        return None
-    mime_type, b64_payload = m.group(1), m.group(2)
+    mime_type, b64_payload = m.group(1).strip().lower(), m.group(2)
    try:
        raw = base64.b64decode(b64_payload)
    except Exception:
@ -48,7 +65,7 @@ def save_base64_data_url(
    limit = DEFAULT_MAX_BYTES if max_bytes is None else max_bytes
    if len(raw) > limit:
        raise FileSizeExceeded(f"File exceeds {limit // (1024 * 1024)}MB limit")
-    ext = mimetypes.guess_extension(mime_type) or ".bin"
+    ext = _MIME_EXTENSION_OVERRIDES.get(mime_type) or mimetypes.guess_extension(mime_type) or ".bin"
    filename = f"{uuid.uuid4().hex[:12]}{ext}"
    dest = media_dir / safe_filename(filename)
    dest.write_bytes(raw)
--- a/nanobot/webui/settings_api.py
+++ b/nanobot/webui/settings_api.py
@ -15,6 +15,7 @@ from zoneinfo import ZoneInfo

 import httpx

+from nanobot.audio.transcription import resolve_transcription_config
 from nanobot.config.loader import get_config_path, load_config, save_config
 from nanobot.config.schema import ModelPresetConfig
 from nanobot.providers.image_generation import (
@ -90,6 +91,7 @@ _IMAGE_GENERATION_ASPECT_RATIOS = {
    "2:3",
    "21:9",
 }
+_TRANSCRIPTION_PROVIDERS = ("groq", "openai")
 _CONTEXT_WINDOW_TOKEN_OPTIONS = {65_536, 262_144}
 _MODEL_CONFIGURATION_SLUG_RE = re.compile(r"[^a-z0-9_-]+")
 _ENV_REF_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
@ -576,6 +578,22 @@ def _image_generation_provider_rows(config: Any) -> list[dict[str, Any]]:
    return rows


+def _transcription_provider_rows(config: Any) -> list[dict[str, Any]]:
+    """Describe each supported transcription provider for the settings payload.
+
+    Produces one dict per entry in ``_TRANSCRIPTION_PROVIDERS``, in that order,
+    holding the provider name, its display label, whether an API key is set,
+    the hint returned by ``_mask_secret_hint``, the configured API base, and
+    the spec's default API base (``None`` when there is no spec or no default).
+    """
+
+    def describe(name: str) -> dict[str, Any]:
+        spec = find_by_name(name)
+        provider_config = getattr(config.providers, name, None)
+        api_key = getattr(provider_config, "api_key", None)
+        # Without a matching spec the raw provider name doubles as the label.
+        label = name if spec is None else spec.label
+        default_base = spec.default_api_base if spec and spec.default_api_base else None
+        return {
+            "name": name,
+            "label": label,
+            "configured": bool(api_key),
+            "api_key_hint": _mask_secret_hint(api_key),
+            "api_base": getattr(provider_config, "api_base", None),
+            "default_api_base": default_base,
+        }
+
+    return [describe(name) for name in _TRANSCRIPTION_PROVIDERS]
+
+
 def settings_payload(
    *,
    requires_restart: bool = False,
@ -633,6 +651,7 @@ def settings_payload(

    search_config = config.tools.web.search
    image_config = config.tools.image_generation
+    transcription = resolve_transcription_config(config)
    search_provider = (
        search_config.provider
        if search_config.provider in _WEB_SEARCH_PROVIDER_BY_NAME
@ -733,6 +752,16 @@ def settings_payload(
            "save_dir": image_config.save_dir,
            "providers": image_providers,
        },
+        "transcription": {
+            "enabled": transcription.enabled,
+            "provider": transcription.provider,
+            "provider_configured": transcription.configured,
+            "model": transcription.model,
+            "language": transcription.language,
+            "max_duration_sec": transcription.max_duration_sec,
+            "max_upload_mb": transcription.max_upload_mb,
+            "providers": _transcription_provider_rows(config),
+        },
        "runtime": {
            "config_path": str(get_config_path().expanduser()),
            "workspace_path": str(config.workspace_path),
@ -1311,3 +1340,71 @@ def update_image_generation_settings(query: QueryParams) -> dict[str, Any]:
    if changed:
        save_config(config)
    return settings_payload(requires_restart=changed)
+
+
+def update_transcription_settings(query: QueryParams) -> dict[str, Any]:
+    """Apply transcription setting changes from query parameters and return the settings payload.
+
+    Recognised keys are ``enabled``, ``provider``, ``model``, ``language``,
+    ``max_duration_sec`` (alias ``maxDurationSec``) and ``max_upload_mb``
+    (alias ``maxUploadMb``). Keys that are absent leave the stored value
+    untouched. The config is saved only when at least one value changed.
+
+    Raises:
+        WebUISettingsError: if the provider is not in ``_TRANSCRIPTION_PROVIDERS``,
+            the model is longer than 200 characters, the language is not 2-3
+            letters, or a numeric limit is not an integer within its range.
+    """
+    config = load_config()
+    transcription = config.transcription
+    changed = False
+
+    enabled = _query_first(query, "enabled")
+    if enabled is not None:
+        parsed_enabled = _parse_bool(enabled, "enabled")
+        if transcription.enabled != parsed_enabled:
+            transcription.enabled = parsed_enabled
+            changed = True
+
+    provider = _query_first(query, "provider")
+    if provider is not None:
+        # Provider names are compared case-insensitively against the allow-list.
+        provider = provider.strip().lower()
+        if provider not in _TRANSCRIPTION_PROVIDERS:
+            raise WebUISettingsError("unknown transcription provider")
+        if transcription.provider != provider:
+            transcription.provider = provider  # type: ignore[assignment]
+            changed = True
+
+    model = _query_first(query, "model")
+    if model is not None:
+        # A blank value clears the stored model by setting it to None.
+        model = model.strip() or None
+        if model is not None and len(model) > 200:
+            raise WebUISettingsError("transcription model is too long")
+        if transcription.model != model:
+            transcription.model = model
+            changed = True
+
+    language = _query_first(query, "language")
+    if language is not None:
+        # Input is lowercased before validation; a blank value clears the language.
+        language = language.strip().lower() or None
+        if language is not None and not re.fullmatch(r"[a-z]{2,3}", language):
+            raise WebUISettingsError("transcription language must be 2-3 lowercase letters")
+        if transcription.language != language:
+            transcription.language = language
+            changed = True
+
+    max_duration_sec = _query_first_alias(query, "max_duration_sec", "maxDurationSec")
+    if max_duration_sec is not None:
+        try:
+            parsed_duration = int(max_duration_sec)
+        except ValueError:
+            # ``from None`` hides the int() traceback behind the user-facing error.
+            raise WebUISettingsError("max_duration_sec must be an integer") from None
+        if parsed_duration < 1 or parsed_duration > 600:
+            raise WebUISettingsError("max_duration_sec must be between 1 and 600")
+        if transcription.max_duration_sec != parsed_duration:
+            transcription.max_duration_sec = parsed_duration
+            changed = True
+
+    max_upload_mb = _query_first_alias(query, "max_upload_mb", "maxUploadMb")
+    if max_upload_mb is not None:
+        try:
+            parsed_upload = int(max_upload_mb)
+        except ValueError:
+            raise WebUISettingsError("max_upload_mb must be an integer") from None
+        if parsed_upload < 1 or parsed_upload > 100:
+            raise WebUISettingsError("max_upload_mb must be between 1 and 100")
+        if transcription.max_upload_mb != parsed_upload:
+            transcription.max_upload_mb = parsed_upload
+            changed = True
+
+    # Persist only when something actually changed.
+    if changed:
+        save_config(config)
+    # The payload is built without ``requires_restart``, unlike the image-generation update above.
+    return settings_payload()
--- a/nanobot/webui/settings_routes.py
+++ b/nanobot/webui/settings_routes.py
@ -33,6 +33,7 @@ from nanobot.webui.settings_api import (
    update_model_configuration,
    update_network_safety_settings,
    update_provider_settings,
+    update_transcription_settings,
    update_web_search_settings,
 )

@ -100,6 +101,8 @@ class WebUISettingsRouter:
            return self._handle_settings_web_search_update(request)
        if path == "/api/settings/image-generation/update":
            return self._handle_settings_image_generation_update(request)
+        if path == "/api/settings/transcription/update":
+            return self._handle_settings_transcription_update(request)
        if path == "/api/settings/network-safety/update":
            return self._handle_settings_network_safety_update(request)
        if path == "/api/settings/cli-apps":
@ -275,6 +278,15 @@ class WebUISettingsRouter:
            return self._error_response(e.status, e.message)
        return self._json_response(self._with_restart_state(payload, section="image"))

+    def _handle_settings_transcription_update(self, request: WsRequest) -> Response:
+        """Serve ``/api/settings/transcription/update``.
+
+        Unauthorized requests get the unauthorized response. A
+        ``WebUISettingsError`` raised while applying the update is turned into
+        an error response carrying its status and message. Otherwise the
+        updated settings payload is returned as JSON.
+        """
+        if not self._authorized(request):
+            return self._unauthorized()
+        try:
+            updated = update_transcription_settings(self._query(request))
+        except WebUISettingsError as err:
+            return self._error_response(err.status, err.message)
+        else:
+            return self._json_response(self._with_restart_state(updated))
+
    def _handle_settings_network_safety_update(self, request: WsRequest) -> Response:
        if not self._authorized(request):
            return self._unauthorized()
--- a/nanobot/webui/transcription_ws.py
+++ b/nanobot/webui/transcription_ws.py
@ -0,0 +1,46 @@
+"""WebUI transcription envelope handling.
+
+The WebSocket channel owns transport and subscription fan-out. This module owns
+the WebUI-specific audio transcription action carried over that socket.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from nanobot.audio.transcription import (
+    TranscriptionIngressError,
+    resolve_transcription_config,
+    transcribe_audio_data_url,
+)
+from nanobot.config.loader import load_config
+
+_MAX_REQUEST_ID_LENGTH = 80
+
+
+async def webui_transcription_event(envelope: dict[str, Any]) -> tuple[str, dict[str, Any]]:
+    """Handle one WebUI transcription envelope and build the reply event.
+
+    Returns ``("transcription_result", {...})`` with the request id and text on
+    success, or ``("transcription_error", {...})`` with a ``detail`` code when
+    the request id is invalid or transcription raises
+    ``TranscriptionIngressError``.
+    """
+    request_id = envelope.get("request_id")
+    if not isinstance(request_id, str) or not (0 < len(request_id) <= _MAX_REQUEST_ID_LENGTH):
+        # No usable id to echo back, so the error carries only the detail.
+        return "transcription_error", {"detail": "invalid_request"}
+
+    def reject(detail: str, **extra: Any) -> tuple[str, dict[str, Any]]:
+        # The validated request id is always attached, after any extra fields.
+        return "transcription_error", {"detail": detail, **extra, "request_id": request_id}
+
+    try:
+        # Config is loaded on every request rather than cached.
+        transcript = await transcribe_audio_data_url(
+            envelope.get("data_url"),
+            resolve_transcription_config(load_config()),
+            duration_ms=envelope.get("duration_ms"),
+        )
+    except TranscriptionIngressError as exc:
+        return reject(exc.detail, **exc.extra)
+    return "transcription_result", {"request_id": request_id, "text": transcript}
--- a/tests/channels/test_channel_plugins.py
+++ b/tests/channels/test_channel_plugins.py
@ -12,7 +12,8 @@ from nanobot.bus.events import OutboundMessage
 from nanobot.bus.queue import MessageBus
 from nanobot.channels.base import BaseChannel
 from nanobot.channels.manager import ChannelManager
-from nanobot.config.schema import ChannelsConfig
+from nanobot.config.loader import save_config
+from nanobot.config.schema import ChannelsConfig, Config
 from nanobot.providers.transcription import GroqTranscriptionProvider as _GroqProvider
 from nanobot.providers.transcription import OpenAITranscriptionProvider as _OpenAIProvider
 from nanobot.utils.restart import RestartNotice
@ -238,102 +239,103 @@ async def test_manager_loads_plugin_from_dict_config():


@pytest.mark.asyncio
-async def test_manager_propagates_groq_transcription_api_base_to_channels():
-    from nanobot.channels.manager import ChannelManager
-
-    fake_config = SimpleNamespace(
-        channels=ChannelsConfig.model_validate({
-            "fakeplugin": {"enabled": True, "allowFrom": ["*"]},
-            "transcriptionLanguage": "en",
-        }),
-        providers=SimpleNamespace(
-            groq=SimpleNamespace(api_key="groq-key", api_base="http://proxy.local/v1/audio/transcriptions"),
-            openai=SimpleNamespace(api_key="openai-key", api_base="https://api.openai.com/v1/audio/transcriptions"),
-        ),
-    )
-
-    with patch(
-        "nanobot.channels.registry.discover_enabled",
-        return_value={"fakeplugin": _FakePlugin},
-    ):
-        mgr = ChannelManager.__new__(ChannelManager)
-        mgr.config = fake_config
-        mgr.bus = MessageBus()
-        mgr.channels = {}
-        mgr._dispatch_task = None
-        mgr._init_channels()
-
-    channel = mgr.channels["fakeplugin"]
-    assert channel.transcription_provider == "groq"
-    assert channel.transcription_api_key == "groq-key"
-    assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
-    assert channel.transcription_language == "en"
-
-
-@pytest.mark.asyncio
-async def test_manager_propagates_openai_transcription_api_base_to_channels():
-    from nanobot.channels.manager import ChannelManager
-
-    fake_config = SimpleNamespace(
-        channels=ChannelsConfig.model_validate({
-            "fakeplugin": {"enabled": True, "allowFrom": ["*"]},
-            "transcriptionProvider": "openai",
-        }),
-        providers=SimpleNamespace(
-            openai=SimpleNamespace(
-                api_key="openai-key",
-                api_base="http://proxy.local/v1/audio/transcriptions",
-            ),
-            groq=SimpleNamespace(api_key="groq-key", api_base=""),
-        ),
-    )
-
-    with patch(
-        "nanobot.channels.registry.discover_enabled",
-        return_value={"fakeplugin": _FakePlugin},
-    ):
-        mgr = ChannelManager.__new__(ChannelManager)
-        mgr.config = fake_config
-        mgr.bus = MessageBus()
-        mgr.channels = {}
-        mgr._dispatch_task = None
-        mgr._init_channels()
-
-    channel = mgr.channels["fakeplugin"]
-    assert channel.transcription_provider == "openai"
-    assert channel.transcription_api_key == "openai-key"
-    assert channel.transcription_api_base == "http://proxy.local/v1/audio/transcriptions"
-
-
-@pytest.mark.asyncio
-async def test_base_channel_passes_api_base_to_openai_transcription_provider():
-    """BaseChannel.transcribe_audio must forward transcription_api_base to OpenAI."""
+async def test_base_channel_reads_current_transcription_config_each_call(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """BaseChannel.transcribe_audio resolves config at call time, not manager init time."""
    from nanobot.providers import transcription as transcription_mod

-    channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
-    channel.transcription_provider = "openai"
-    channel.transcription_api_key = "k"
-    channel.transcription_api_base = "http://override/v1/audio/transcriptions"
-    channel.transcription_language = "en"
+    config_path = tmp_path / "config.json"
+    config = Config()
+    config.transcription.provider = "openai"
+    config.transcription.model = "whisper-custom"
+    config.transcription.language = "en"
+    config.providers.openai.api_key = "openai-key"
+    config.providers.openai.api_base = "http://openai.local/v1/audio/transcriptions"
+    save_config(config, config_path)
+    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)

-    captured: dict[str, object] = {}
+    channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
+
+    calls: list[dict[str, object]] = []

    class _StubOpenAI:
-        def __init__(self, api_key=None, api_base=None, language=None):
-            captured["api_key"] = api_key
-            captured["api_base"] = api_base
-            captured["language"] = language
+        def __init__(self, api_key=None, api_base=None, language=None, model=None):
+            calls.append({
+                "provider": "openai",
+                "api_key": api_key,
+                "api_base": api_base,
+                "language": language,
+                "model": model,
+            })

        async def transcribe(self, file_path):
-            return "ok"
+            return "openai-ok"

-    with patch.object(transcription_mod, "OpenAITranscriptionProvider", _StubOpenAI):
-        result = await channel.transcribe_audio("/tmp/does-not-matter.wav")
+    class _StubGroq:
+        def __init__(self, api_key=None, api_base=None, language=None, model=None):
+            calls.append({
+                "provider": "groq",
+                "api_key": api_key,
+                "api_base": api_base,
+                "language": language,
+                "model": model,
+            })

-    assert result == "ok"
-    assert captured["api_key"] == "k"
-    assert captured["api_base"] == "http://override/v1/audio/transcriptions"
-    assert captured["language"] == "en"
+        async def transcribe(self, file_path):
+            return "groq-ok"
+
+    with (
+        patch.object(transcription_mod, "OpenAITranscriptionProvider", _StubOpenAI),
+        patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq),
+    ):
+        assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "openai-ok"
+
+        config.transcription.provider = "groq"
+        config.transcription.model = "whisper-large-v3-turbo"
+        config.transcription.language = "ko"
+        config.providers.groq.api_key = "groq-key"
+        config.providers.groq.api_base = "http://groq.local/v1/audio/transcriptions"
+        save_config(config, config_path)
+
+        assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == "groq-ok"
+
+    assert calls == [
+        {
+            "provider": "openai",
+            "api_key": "openai-key",
+            "api_base": "http://openai.local/v1/audio/transcriptions",
+            "language": "en",
+            "model": "whisper-custom",
+        },
+        {
+            "provider": "groq",
+            "api_key": "groq-key",
+            "api_base": "http://groq.local/v1/audio/transcriptions",
+            "language": "ko",
+            "model": "whisper-large-v3-turbo",
+        },
+    ]
+
+
+@pytest.mark.asyncio
+async def test_base_channel_respects_disabled_transcription_config(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """With transcription disabled in the saved config, transcribe_audio returns "" and never builds the Groq provider."""
+    config_path = tmp_path / "config.json"
+    config = Config()
+    config.transcription.enabled = False
+    config.providers.groq.api_key = "groq-key"
+    save_config(config, config_path)
+    # Point the config loader at the temporary config file written above.
+    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
+
+    channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
+
+    with patch("nanobot.providers.transcription.GroqTranscriptionProvider") as provider:
+        assert await channel.transcribe_audio("/tmp/does-not-matter.wav") == ""
+    provider.assert_not_called()


 def test_openai_transcription_provider_honors_api_base_argument():
@ -348,37 +350,6 @@ def test_openai_transcription_provider_honors_api_base_argument():
    assert custom.api_url == "http://override/v1/audio/transcriptions"


-@pytest.mark.asyncio
-async def test_base_channel_passes_language_to_groq_transcription_provider():
-    """BaseChannel.transcribe_audio must forward transcription_language to Groq."""
-    from nanobot.providers import transcription as transcription_mod
-
-    channel = _FakePlugin({"enabled": True, "allowFrom": ["*"]}, MessageBus())
-    channel.transcription_provider = "groq"
-    channel.transcription_api_key = "k"
-    channel.transcription_api_base = "http://override/v1/audio/transcriptions"
-    channel.transcription_language = "ko"
-
-    captured: dict[str, object] = {}
-
-    class _StubGroq:
-        def __init__(self, api_key=None, api_base=None, language=None):
-            captured["api_key"] = api_key
-            captured["api_base"] = api_base
-            captured["language"] = language
-
-        async def transcribe(self, file_path):
-            return "ok"
-
-    with patch.object(transcription_mod, "GroqTranscriptionProvider", _StubGroq):
-        result = await channel.transcribe_audio("/tmp/does-not-matter.wav")
-
-    assert result == "ok"
-    assert captured["api_key"] == "k"
-    assert captured["api_base"] == "http://override/v1/audio/transcriptions"
-    assert captured["language"] == "ko"
-
-
 # ---------------------------------------------------------------------------
 # Transcription provider HTTP tests
 # ---------------------------------------------------------------------------
--- a/tests/channels/test_websocket_envelope_media.py
+++ b/tests/channels/test_websocket_envelope_media.py
@ -69,6 +69,7 @@ def _make_channel() -> WebSocketChannel:
    [
        ("data:image/png;base64,AAAA", "image/png"),
        ("data:image/jpeg;base64,AAAA", "image/jpeg"),
+        ("data:audio/webm;codecs=opus;base64,AAAA", "audio/webm"),
        ("data:IMAGE/PNG;base64,AAAA", "image/png"),
        ("data:image/svg+xml;base64,AAAA", "image/svg+xml"),
        ("data:text/plain;base64,AAAA", "text/plain"),
--- a/tests/channels/test_whatsapp_channel.py
+++ b/tests/channels/test_whatsapp_channel.py
@ -271,8 +271,6 @@ async def test_lid_to_phone_cache_resolves_lid_only_messages():
 async def test_voice_message_transcription_uses_media_path():
    """Voice messages are transcribed when media path is available."""
    ch = WhatsAppChannel({"enabled": True, "allowFrom": ["*"]}, MagicMock())
-    ch.transcription_provider = "openai"
-    ch.transcription_api_key = "sk-test"
    ch._handle_message = AsyncMock()
    ch.transcribe_audio = AsyncMock(return_value="Hello world")

--- a/tests/providers/test_transcription.py
+++ b/tests/providers/test_transcription.py
@ -8,6 +8,8 @@ from unittest.mock import AsyncMock, patch
 import httpx
 import pytest

+from nanobot.audio.transcription import resolve_transcription_config
+from nanobot.config.schema import Config
 from nanobot.providers.transcription import (
    GroqTranscriptionProvider,
    OpenAITranscriptionProvider,
@ -33,6 +35,65 @@ def _raw_response(status: int, content: bytes) -> httpx.Response:
    return httpx.Response(status_code=status, content=content, request=request)


+def test_resolver_uses_legacy_channel_provider_when_top_level_is_unset() -> None:
+    """Legacy ``channels.transcription_*`` values are used when ``config.transcription`` is left untouched."""
+    config = Config()
+    config.channels.transcription_provider = "openai"
+    config.channels.transcription_language = "en"
+    config.providers.openai.api_key = "sk-test"
+    config.providers.openai.api_base = "https://proxy.example/v1"
+
+    resolved = resolve_transcription_config(config)
+
+    assert resolved.provider == "openai"
+    # No model is configured here, yet the resolver reports "whisper-1".
+    assert resolved.model == "whisper-1"
+    assert resolved.language == "en"
+    assert resolved.api_key == "sk-test"
+    assert resolved.api_base == "https://proxy.example/v1"
+    assert resolved.configured is True
+
+
+def test_resolver_prefers_top_level_transcription_over_legacy_channels() -> None:
+    """Values set on ``config.transcription`` win over the legacy ``channels.transcription_*`` fields."""
+    config = Config()
+    config.channels.transcription_provider = "openai"
+    config.channels.transcription_language = "en"
+    config.transcription.provider = "groq"
+    config.transcription.model = "whisper-large-v3-turbo"
+    config.transcription.language = "ko"
+    config.providers.groq.api_key = "gsk-test"
+    config.providers.groq.api_base = "https://groq.example/openai/v1"
+
+    resolved = resolve_transcription_config(config)
+
+    assert resolved.provider == "groq"
+    assert resolved.model == "whisper-large-v3-turbo"
+    assert resolved.language == "ko"
+    # Credentials come from the provider section matching the resolved provider.
+    assert resolved.api_key == "gsk-test"
+    assert resolved.api_base == "https://groq.example/openai/v1"
+
+
+def test_resolved_transcription_repr_hides_api_key() -> None:
+    """The resolved config's ``repr`` exposes neither the API key value nor an ``api_key`` field name."""
+    config = Config()
+    config.providers.groq.api_key = "gsk-secret"
+
+    resolved = resolve_transcription_config(config)
+
+    assert "gsk-secret" not in repr(resolved)
+    assert "api_key" not in repr(resolved)
+
+
+def test_resolver_keeps_enabled_and_limits_on_effective_config() -> None:
+    """The ``enabled`` flag and both limits set on ``config.transcription`` carry through to the resolved config."""
+    config = Config()
+    config.transcription.enabled = False
+    config.transcription.max_duration_sec = 45
+    config.transcription.max_upload_mb = 12
+
+    resolved = resolve_transcription_config(config)
+
+    assert resolved.enabled is False
+    assert resolved.max_duration_sec == 45
+    assert resolved.max_upload_mb == 12
+
+
 # ---------------------------------------------------------------------------
 # OpenAI provider — retry on transient HTTP + network errors
 # ---------------------------------------------------------------------------
@ -215,6 +276,32 @@ async def test_provider_omits_language_when_unset(
    assert "language" not in files


+@pytest.mark.asyncio
+async def test_provider_forwards_custom_model_in_multipart(audio_file: Path) -> None:
+    """A model passed to the provider constructor is sent as the multipart ``model`` field."""
+    provider = GroqTranscriptionProvider(api_key="k", model="whisper-large-v3-turbo")
+    post = AsyncMock(return_value=_response(200, {"text": "ok"}))
+    # asyncio.sleep is replaced with a mock so any retry delay would not slow the test.
+    with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()):
+        result = await provider.transcribe(audio_file)
+
+    assert result == "ok"
+    files = post.await_args_list[0].kwargs["files"]
+    assert files["model"] == (None, "whisper-large-v3-turbo")
+
+
+@pytest.mark.asyncio
+async def test_provider_forwards_file_mime_type(tmp_path: Path) -> None:
+    """The multipart ``file`` tuple carries a MIME type (``audio/webm`` for ``voice.webm``)."""
+    audio = tmp_path / "voice.webm"
+    audio.write_bytes(b"audio")
+    provider = GroqTranscriptionProvider(api_key="k")
+    post = AsyncMock(return_value=_response(200, {"text": "ok"}))
+    with patch("httpx.AsyncClient.post", post), patch("asyncio.sleep", AsyncMock()):
+        result = await provider.transcribe(audio)
+
+    assert result == "ok"
+    files = post.await_args_list[0].kwargs["files"]
+    # Tuple layout asserted here: (filename, raw bytes, content type).
+    assert files["file"] == ("voice.webm", b"audio", "audio/webm")
+
+
@pytest.mark.asyncio
 async def test_language_survives_retry(audio_file: Path) -> None:
    """Regression: language must be present on every retry attempt, not just the first."""
--- a/tests/tools/test_exec_session_tools.py
+++ b/tests/tools/test_exec_session_tools.py
@ -6,8 +6,12 @@ import shlex
 import subprocess
 import sys

+from nanobot.agent.tools.exec_session import (
+    ExecSessionManager,
+    ListExecSessionsTool,
+    WriteStdinTool,
+)
 from nanobot.agent.tools.shell import ExecTool
-from nanobot.agent.tools.exec_session import ExecSessionManager, ListExecSessionsTool, WriteStdinTool


 def _python_command(code: str) -> str:
@ -141,7 +145,7 @@ def test_exec_can_continue_with_stdin(tmp_path):
        return initial, result

    initial, result = asyncio.run(run())
-    assert "ready" in initial
+    assert "ready" in initial + result
    assert "Process running" in initial
    assert "Elapsed:" in initial
    assert "got:ping" in result
@ -170,7 +174,7 @@ def test_write_stdin_can_close_stdin(tmp_path):
        return initial, result

    initial, result = asyncio.run(run())
-    assert "ready" in initial
+    assert "ready" in initial + result
    assert "got:payload" in result
    assert "Stdin closed." in result
    assert "Exit code: 0" in result
@ -185,14 +189,20 @@ def test_write_stdin_can_terminate_session(tmp_path):
            "import time; print('ready', flush=True); time.sleep(30)"
        )

-        initial = await exec_tool.execute(command=command, yield_time_ms=500)
+        initial = await exec_tool.execute(command=command, yield_time_ms=100)
        sid = _session_id(initial)
+        waited = await stdin_tool.execute(
+            session_id=sid,
+            wait_for="ready",
+            wait_timeout_ms=3000,
+            yield_time_ms=0,
+        )
        result = await stdin_tool.execute(
            session_id=sid,
            terminate=True,
            yield_time_ms=0,
        )
-        return initial, result
+        return initial + waited, result

    initial, result = asyncio.run(run())
    assert "ready" in initial
@ -243,7 +253,7 @@ def test_write_stdin_preserves_completed_session_output_until_polled(tmp_path):

    initial, final = asyncio.run(run())

-    assert "ready" in initial
+    assert "ready" in initial + final
    assert "done" in final
    assert "Exit code: 0" in final

--- a/tests/utils/test_media_decode.py
+++ b/tests/utils/test_media_decode.py
@ -8,8 +8,8 @@ import pytest

 from nanobot.utils.media_decode import (
    DEFAULT_MAX_BYTES,
-    FileSizeExceeded,
    MAX_FILE_SIZE,
+    FileSizeExceeded,
    save_base64_data_url,
 )

@ -25,6 +25,31 @@ def test_saves_png_with_correct_extension(tmp_path) -> None:
    assert (tmp_path / result.split("/")[-1]).read_bytes() == b"fake png"


+def test_saves_data_url_with_mime_parameters(tmp_path) -> None:
+    """A data URL whose MIME type carries parameters (``;codecs=opus``) is decoded and saved as ``.webm``."""
+    result = save_base64_data_url(_data_url(b"voice", mime="audio/webm;codecs=opus"), tmp_path)
+    assert result is not None
+    assert result.endswith(".webm")
+    assert (tmp_path / result.split("/")[-1]).read_bytes() == b"voice"
+
+
+@pytest.mark.parametrize(
+    ("mime", "suffix"),
+    [
+        ("audio/webm", ".webm"),
+        ("video/webm", ".webm"),
+        ("audio/ogg", ".ogg"),
+        ("audio/wav", ".wav"),
+        ("audio/mpga", ".mpga"),
+    ],
+)
+def test_saves_common_audio_with_api_friendly_extension(
+    tmp_path, mime: str, suffix: str
+) -> None:
+    """Each listed audio/video MIME type is saved with the paired file suffix."""
+    result = save_base64_data_url(_data_url(b"voice", mime=mime), tmp_path)
+    assert result is not None
+    assert result.endswith(suffix)
+
+
 def test_returns_none_for_malformed_data_url(tmp_path) -> None:
    assert save_base64_data_url("not-a-data-url", tmp_path) is None

--- a/tests/webui/test_settings_api.py
+++ b/tests/webui/test_settings_api.py
@ -18,6 +18,7 @@ from nanobot.webui.settings_api import (
    update_agent_settings,
    update_model_configuration,
    update_network_safety_settings,
+    update_transcription_settings,
 )


@ -243,6 +244,75 @@ def test_settings_payload_includes_network_safety_fields(
    assert payload["advanced"]["ssrf_whitelist_count"] == 1


+def test_settings_payload_includes_effective_transcription_config(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    config_path = tmp_path / "config.json"
+    config = Config()
+    config.channels.transcription_provider = "openai"  # only the channel-level transcription fields are set in this test
+    config.channels.transcription_language = "en"
+    config.providers.openai.api_key = "sk-test"  # gives the openai provider a key so it can count as configured
+    save_config(config, config_path)
+    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)  # point the loader at the temp config
+
+    payload = settings_payload()
+
+    assert payload["transcription"]["enabled"] is True
+    assert payload["transcription"]["provider"] == "openai"  # the channel-level provider shows up in the payload
+    assert payload["transcription"]["provider_configured"] is True
+    assert payload["transcription"]["model"] == "whisper-1"  # never set above — presumably a provider default resolved by settings_payload; confirm
+    assert payload["transcription"]["language"] == "en"
+
+
+def test_update_transcription_settings_writes_top_level_only(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    config_path = tmp_path / "config.json"
+    config = Config()
+    config.channels.transcription_provider = "openai"  # channel-level values that the update must leave untouched
+    config.channels.transcription_language = "en"
+    config.providers.groq.api_key = "gsk-test"  # key for the provider the update switches to
+    save_config(config, config_path)
+    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)  # point the loader at the temp config
+
+    payload = update_transcription_settings(
+        {  # every value is a list of strings — presumably parsed query parameters; confirm against the caller
+            "enabled": ["true"],
+            "provider": ["groq"],
+            "model": ["whisper-large-v3-turbo"],
+            "language": ["ko"],
+            "maxDurationSec": ["90"],
+            "maxUploadMb": ["20"],
+        }
+    )
+
+    saved = load_config(config_path)  # re-read from disk to check what was actually persisted
+    assert saved.channels.transcription_provider == "openai"  # channel-level fields are unchanged
+    assert saved.channels.transcription_language == "en"
+    assert saved.transcription.enabled is True  # the update landed on the top-level transcription section
+    assert saved.transcription.provider == "groq"
+    assert saved.transcription.model == "whisper-large-v3-turbo"
+    assert saved.transcription.language == "ko"
+    assert saved.transcription.max_duration_sec == 90  # string inputs are stored as integers
+    assert saved.transcription.max_upload_mb == 20
+    assert payload["transcription"]["provider"] == "groq"  # the returned payload reflects the new provider
+    assert payload["transcription"]["provider_configured"] is True
+
+
+def test_update_transcription_settings_validates_language(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    config_path = tmp_path / "config.json"
+    save_config(Config(), config_path)  # a default config is enough; only input validation is exercised
+    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
+
+    with pytest.raises(WebUISettingsError, match="transcription language"):  # the error message must mention the offending field
+        update_transcription_settings({"language": ["en-US"]})  # "en-US" is expected to be rejected
+
+
 def test_settings_payload_includes_token_usage_summary(
    tmp_path,
    monkeypatch: pytest.MonkeyPatch,
--- a/tests/webui/test_transcription_ws.py
+++ b/tests/webui/test_transcription_ws.py
@ -0,0 +1,129 @@
+"""Tests for WebUI transcription envelopes carried over the gateway socket."""
+
+from __future__ import annotations
+
+import base64
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+from nanobot.config.loader import save_config
+from nanobot.config.schema import Config
+from nanobot.webui.transcription_ws import webui_transcription_event
+
+
+def _audio_data_url(payload: bytes = b"voice", mime: str = "audio/webm") -> str:  # test helper: build a base64 data URL for the given bytes and MIME type
+    return f"data:{mime};base64,{base64.b64encode(payload).decode('ascii')}"
+
+
+@pytest.mark.asyncio
+async def test_webui_transcribe_audio_rejects_unconfigured_provider(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    config_path = tmp_path / "config.json"
+    config = Config()
+    config.transcription.provider = "groq"  # this test sets no groq API key
+    save_config(config, config_path)
+    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)  # point the loader at the temp config
+
+    event, payload = await webui_transcription_event({
+        "request_id": "voice-1",
+        "data_url": _audio_data_url(),
+    })
+
+    assert event == "transcription_error"
+    assert payload == {  # exact match: the error echoes the request id and names the provider
+        "request_id": "voice-1",
+        "detail": "not_configured",
+        "provider": "groq",
+    }
+
+
+@pytest.mark.asyncio
+async def test_webui_transcribe_audio_rejects_unsupported_mime(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    config_path = tmp_path / "config.json"
+    config = Config()
+    config.transcription.provider = "groq"
+    config.providers.groq.api_key = "gsk-test"  # a key is set here, so the failure asserted below is about the MIME type
+    save_config(config, config_path)
+    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
+
+    event, payload = await webui_transcription_event({
+        "request_id": "voice-1",
+        "data_url": _audio_data_url(mime="text/plain"),  # a non-audio MIME type
+    })
+
+    assert event == "transcription_error"
+    assert payload["request_id"] == "voice-1"
+    assert payload["detail"] == "mime"  # only the id and detail code are pinned, not the full payload
+
+
+@pytest.mark.asyncio
+async def test_webui_transcribe_audio_rejects_oversized_audio(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    config_path = tmp_path / "config.json"
+    config = Config()
+    config.transcription.provider = "groq"
+    config.transcription.max_upload_mb = 1  # smallest limit, so the oversized payload below stays cheap to build
+    config.providers.groq.api_key = "gsk-test"
+    save_config(config, config_path)
+    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
+    monkeypatch.setattr("nanobot.audio.transcription.get_media_dir", lambda _channel=None: tmp_path)  # replace the media dir lookup with tmp_path
+
+    event, payload = await webui_transcription_event({
+        "request_id": "voice-1",
+        "data_url": _audio_data_url(payload=b"x" * (1024 * 1024 + 1)),  # one byte over 1 MiB — assumes the limit counts 1024 * 1024 bytes per MB
+    })
+
+    assert event == "transcription_error"
+    assert payload["request_id"] == "voice-1"
+    assert payload["detail"] == "size"  # only the id and detail code are pinned, not the full payload
+
+
+@pytest.mark.asyncio
+async def test_webui_transcribe_audio_returns_text_and_removes_temp_file(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    config_path = tmp_path / "config.json"
+    media_dir = tmp_path / "media"
+    media_dir.mkdir()
+    config = Config()
+    config.transcription.provider = "groq"
+    config.providers.groq.api_key = "gsk-test"
+    save_config(config, config_path)
+    monkeypatch.setattr("nanobot.config.loader._current_config_path", config_path)
+    monkeypatch.setattr(
+        "nanobot.audio.transcription.get_media_dir",
+        lambda _channel=None: media_dir,  # replace the media dir lookup with the temp media dir
+    )
+    captured_paths: list[Path] = []  # paths handed to the fake transcriber, checked after the call
+
+    async def fake_transcribe_audio_file(path: str | Path, _resolved: Any) -> str:  # stand-in for the real transcription call
+        p = Path(path)
+        assert p.exists()  # the saved audio must still be on disk while transcription runs
+        captured_paths.append(p)
+        return "hello voice"
+
+    monkeypatch.setattr(
+        "nanobot.audio.transcription.transcribe_audio_file",
+        fake_transcribe_audio_file,
+    )
+
+    event, payload = await webui_transcription_event({
+        "request_id": "voice-1",
+        "data_url": _audio_data_url(payload=b"webm voice", mime="audio/webm;codecs=opus"),  # MIME type with a codecs parameter
+        "duration_ms": 1200,
+    })
+
+    assert event == "transcription_result"
+    assert payload == {"request_id": "voice-1", "text": "hello voice"}  # exact match: the id plus the fake transcriber's text
+    assert captured_paths  # the fake transcriber was actually invoked
+    assert not captured_paths[0].exists()  # and the saved file is gone once the event has been produced
--- a/webui/src/App.tsx
+++ b/webui/src/App.tsx
@ -81,6 +81,7 @@ const SETTINGS_SECTION_KEYS: SettingsSectionKey[] = [
  "appearance",
  "models",
  "image",
+  "voice",
  "browser",
  "apps",
  "skills",
--- a/webui/src/components/CodeBlock.tsx
+++ b/webui/src/components/CodeBlock.tsx
@ -1,8 +1,9 @@
-import { Suspense, lazy, useCallback, useState } from "react";
+import { Suspense, lazy, useCallback, useState, type ReactNode } from "react";
 import { Check, Copy } from "lucide-react";
 import { useTranslation } from "react-i18next";

 import { useThemeValue } from "@/hooks/useTheme";
+import { hasAnsi, parseAnsiSegments, stripAnsi } from "@/lib/ansi";
 import { cn } from "@/lib/utils";

 interface CodeBlockProps {
@ -36,6 +37,10 @@ const CODE_FONT_STACK = [
  "monospace",
 ].join(", ");

+const ANSI_LANGUAGES = new Set(["ansi", "ansi-output"]);
+const CODE_SURFACE_LIGHT = "#f4f4f5";
+const CODE_SURFACE_DARK = "#27272a";
+
 const LazyHighlightedCode = lazy(async () => {
  const [
    { default: SyntaxHighlighter },
@ -74,7 +79,11 @@ const LazyHighlightedCode = lazy(async () => {
          language={language || "text"}
          style={transparentTheme}
          customStyle={{
-            background: chrome === "none" ? "transparent" : undefined,
+            background: chrome === "none"
+              ? "transparent"
+              : isDark
+                ? CODE_SURFACE_DARK
+                : CODE_SURFACE_LIGHT,
            margin: 0,
            padding: chrome === "none" ? "0.75rem 1rem" : "1rem",
            fontFamily: CODE_FONT_STACK,
@ -83,10 +92,10 @@ const LazyHighlightedCode = lazy(async () => {
            tabSize: 2,
          }}
          codeTagProps={{
-            style: chrome === "none" ? {
+            style: {
              background: "transparent",
              fontFamily: CODE_FONT_STACK,
-            } : undefined,
+            },
          }}
          lineNumberStyle={{
            minWidth: "2.6em",
@ -106,14 +115,32 @@ const LazyHighlightedCode = lazy(async () => {
  };
 });

-function PlainCodeFallback({
+function renderPlainText(value: string): ReactNode { // identity renderer; the default `renderText` of CodeTextBlock
+  return value;
+}
+
+function renderAnsiText(value: string): ReactNode { // wraps each segment from parseAnsiSegments in a <span> carrying segment.style
+  return parseAnsiSegments(value).map((segment, index) => ( // NOTE(review): CodeTextBlock calls this once per line when line numbers are shown — confirm styles need not carry across lines
+    <span key={index} style={segment.style}>
+      {segment.text}
+    </span>
+  ));
+}
+
+function CodeTextBlock({
  code,
  chrome,
  showLineNumbers,
+  testId,
+  className,
+  renderText = renderPlainText,
 }: {
  code: string;
  chrome: "default" | "none";
  showLineNumbers: boolean;
+  testId: string;
+  className?: string;
+  renderText?: (value: string) => ReactNode;
 }) {
  const lines = code.split("\n");
  return (
@ -121,10 +148,11 @@ function PlainCodeFallback({
      className={cn(
        "m-0 overflow-x-auto p-4 font-mono text-sm leading-[1.6] text-foreground/90",
        showLineNumbers ? "whitespace-pre" : "whitespace-pre-wrap",
-        chrome === "default" ? "bg-background" : "bg-transparent",
+        chrome === "default" ? "bg-zinc-100 dark:bg-zinc-800" : "bg-transparent",
        chrome === "none" && "p-3 text-[13px] leading-[1.55]",
+        className,
      )}
-      data-testid="plain-code-fallback"
+      data-testid={testId}
    >
      <code className="text-inherit">
        {showLineNumbers ? (
@ -133,16 +161,21 @@ function PlainCodeFallback({
              <span className="w-10 shrink-0 select-none pr-4 text-right text-muted-foreground/60">
                {index + 1}
              </span>
-              <span className="whitespace-pre">{line || " "}</span>
+              <span className="whitespace-pre">{renderText(line || " ")}</span>
              {index < lines.length - 1 ? "\n" : null}
            </span>
          ))
-        ) : code}
+        ) : renderText(code)}
      </code>
    </pre>
  );
 }

+function shouldRenderAnsi(language: string | undefined, code: string): boolean { // decides whether a block takes the ANSI rendering path
+  const normalized = language?.trim().toLowerCase(); // language tags are matched case-insensitively, ignoring surrounding whitespace
+  return Boolean((normalized && ANSI_LANGUAGES.has(normalized)) || hasAnsi(code)); // either an ANSI language tag or a hasAnsi(code) hit is enough, whatever the declared language
+}
+
 export function CodeBlock({
  language,
  code,
@ -156,19 +189,20 @@ export function CodeBlock({
  const [copied, setCopied] = useState(false);
  const isDark = useThemeValue() === "dark";
  const hasChrome = chrome === "default";
+  const renderAnsi = shouldRenderAnsi(language, code);

  const onCopy = useCallback(() => {
    if (!navigator.clipboard) return;
-    navigator.clipboard.writeText(code).then(() => {
+    navigator.clipboard.writeText(renderAnsi ? stripAnsi(code) : code).then(() => {
      setCopied(true);
      setTimeout(() => setCopied(false), 1_500);
    });
-  }, [code]);
+  }, [code, renderAnsi]);

  return (
    <div
      className={cn(
-        "overflow-hidden",
+        "not-prose overflow-hidden",
        hasChrome && "rounded-lg border",
        hasChrome && (isDark ? "border-white/10" : "border-black/10"),
        className,
@ -177,7 +211,7 @@ export function CodeBlock({
      {hasChrome ? (
        <div
          className={cn(
-            "flex items-center justify-between px-4 py-1.5 text-xs font-medium",
+            "flex items-center justify-between px-4 pb-1.5 pt-2 text-xs font-medium",
            isDark
              ? "bg-zinc-800 text-zinc-300"
              : "bg-zinc-100 text-zinc-600",
@ -206,13 +240,22 @@ export function CodeBlock({
          </button>
        </div>
      ) : null}
-      {highlight ? (
+      {renderAnsi ? (
+        <CodeTextBlock
+          code={code}
+          chrome={chrome}
+          showLineNumbers={showLineNumbers}
+          testId="ansi-code"
+          renderText={renderAnsiText}
+        />
+      ) : highlight ? (
        <Suspense
          fallback={
-            <PlainCodeFallback
+            <CodeTextBlock
              code={code}
              chrome={chrome}
              showLineNumbers={showLineNumbers}
+              testId="plain-code-fallback"
            />
          }
        >
@ -226,10 +269,11 @@ export function CodeBlock({
          />
        </Suspense>
      ) : (
-        <PlainCodeFallback
+        <CodeTextBlock
          code={code}
          chrome={chrome}
          showLineNumbers={showLineNumbers}
+          testId="plain-code-fallback"
        />
      )}
    </div>
--- a/webui/src/components/settings/SettingsView.tsx
+++ b/webui/src/components/settings/SettingsView.tsx
@ -31,6 +31,7 @@ import {
  Layers,
  Loader2,
  LogOut,
+  Mic,
  Moon,
  PlayCircle,
  Plus,
@ -92,6 +93,7 @@ import {
  updateNetworkSafetySettings,
  updateProviderSettings,
  updateSettings,
+  updateTranscriptionSettings,
  updateWebSearchSettings,
 } from "@/lib/api";
 import { notifyCliAppsChanged } from "@/lib/cli-app-events";
@ -115,6 +117,7 @@ import type {
  ProviderModelsPayload,
  SettingsPayload,
  SkillSummary,
+  TranscriptionSettingsUpdate,
  WebSearchSettingsUpdate,
  WebuiDefaultAccessMode,
 } from "@/lib/types";
@ -124,6 +127,7 @@ export type SettingsSectionKey =
  | "appearance"
  | "models"
  | "image"
+  | "voice"
  | "browser"
  | "apps"
  | "skills"
@ -367,6 +371,26 @@ const DEFAULT_IMAGE_GENERATION_FORM: ImageGenerationSettingsUpdate = {
  maxImagesPerTurn: 4,
 };

+const DEFAULT_TRANSCRIPTION_FORM: TranscriptionSettingsUpdate = { // initial form draft used when SettingsView has no initialSettings
+  enabled: true,
+  provider: "groq",
+  model: "", // NOTE(review): empty here but "whisper-large-v3" in DEFAULT_TRANSCRIPTION_SETTINGS — confirm the mismatch is intended
+  language: "", // empty string stands for "no language hint"
+  maxDurationSec: 120,
+  maxUploadMb: 25,
+};
+
+const DEFAULT_TRANSCRIPTION_SETTINGS: NonNullable<SettingsPayload["transcription"]> = { // stand-in used wherever a payload has no `transcription` section
+  enabled: true,
+  provider: "groq",
+  provider_configured: false,
+  model: "whisper-large-v3",
+  language: null, // null here maps to "" in the form draft
+  max_duration_sec: 120,
+  max_upload_mb: 25,
+  providers: [], // no provider list is known without a server payload
+};
+
 const DEFAULT_NETWORK_SAFETY_FORM: NetworkSafetySettingsUpdate = {
  webuiAllowLocalServiceAccess: true,
  webuiDefaultAccessMode: "default",
@ -419,6 +443,18 @@ function imageGenerationFormFromPayload(payload: SettingsPayload): ImageGenerati
  };
 }

+function transcriptionFormFromPayload(payload: SettingsPayload): TranscriptionSettingsUpdate { // maps the snake_case payload section onto the camelCase form draft
+  const transcription = payload.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS; // payloads without the section fall back to defaults
+  return {
+    enabled: transcription.enabled,
+    provider: transcription.provider,
+    model: transcription.model,
+    language: transcription.language ?? "", // a null language becomes an empty input value
+    maxDurationSec: transcription.max_duration_sec,
+    maxUploadMb: transcription.max_upload_mb,
+  };
+}
+
 function networkSafetyFormFromPayload(payload: SettingsPayload): NetworkSafetySettingsUpdate {
  return {
    webuiAllowLocalServiceAccess:
@ -479,6 +515,7 @@ export function SettingsView({
  const [providerSaving, setProviderSaving] = useState<string | null>(null);
  const [webSearchSaving, setWebSearchSaving] = useState(false);
  const [imageGenerationSaving, setImageGenerationSaving] = useState(false);
+  const [transcriptionSaving, setTranscriptionSaving] = useState(false);
  const [networkSafetySaving, setNetworkSafetySaving] = useState(false);
  const [hostEngineApplying, setHostEngineApplying] = useState(false);
  const [error, setError] = useState<string | null>(null);
@ -511,6 +548,9 @@ export function SettingsView({
        ? imageGenerationFormFromPayload(initialSettings)
        : DEFAULT_IMAGE_GENERATION_FORM,
  );
+  const [transcriptionForm, setTranscriptionForm] = useState<TranscriptionSettingsUpdate>(
+    () => initialSettings ? transcriptionFormFromPayload(initialSettings) : DEFAULT_TRANSCRIPTION_FORM,
+  );
  const [networkSafetyForm, setNetworkSafetyForm] = useState<NetworkSafetySettingsUpdate>(() =>
    initialSettings ? networkSafetyFormFromPayload(initialSettings) : DEFAULT_NETWORK_SAFETY_FORM,
  );
@ -543,6 +583,7 @@ export function SettingsView({
    setForm(agentDraftFromPayload(payload));
    setWebSearchForm((prev) => webSearchFormFromPayload(payload, prev));
    setImageGenerationForm(imageGenerationFormFromPayload(payload));
+    setTranscriptionForm(transcriptionFormFromPayload(payload));
    setNetworkSafetyForm(networkSafetyFormFromPayload(payload));
    if (payload.restart_required_sections) {
      setPendingRestartSections(pendingRestartSectionsFromPayload(payload));
@ -711,6 +752,19 @@ export function SettingsView({
    );
  }, [imageGenerationForm, settings]);

+  const transcriptionDirty = useMemo(() => { // true when the form draft differs from the loaded settings in any field
+    if (!settings) return false; // nothing loaded yet, so there is nothing to compare against
+    const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS; // same fallback as transcriptionFormFromPayload
+    return (
+      transcriptionForm.enabled !== transcription.enabled ||
+      transcriptionForm.provider !== transcription.provider ||
+      transcriptionForm.model !== transcription.model ||
+      transcriptionForm.language !== (transcription.language ?? "") || // a null language is compared as ""
+      transcriptionForm.maxDurationSec !== transcription.max_duration_sec ||
+      transcriptionForm.maxUploadMb !== transcription.max_upload_mb
+    );
+  }, [settings, transcriptionForm]);
+
  const networkSafetyDirty = useMemo(() => {
    if (!settings) return false;
    const currentLocalServiceAccess =
@ -913,6 +967,24 @@ export function SettingsView({
    }
  };

+  const saveTranscriptionSettings = async () => { // persists the transcription form draft, then resyncs local state from the response
+    if (!settings || !transcriptionDirty || transcriptionSaving) return; // skip when not loaded, unchanged, or a save is already in flight
+    setTranscriptionSaving(true);
+    try {
+      const payload = await updateTranscriptionSettings(token, transcriptionForm);
+      applyPayload(payload); // applyPayload resets the form drafts, including this one, from the returned payload
+      if (payload.requires_restart) {
+        setPendingRestartSections((prev) => ({ ...prev, browser: true })); // NOTE(review): flags the "browser" section, not a voice-specific key — confirm this is intended
+      }
+      await maybeRestartHostEngine(payload);
+      setError(null); // clear any earlier error once the whole save path succeeded
+    } catch (err) {
+      setError((err as Error).message); // assumes the rejection value is an Error
+    } finally {
+      setTranscriptionSaving(false); // always release the saving flag, on success or failure
+    }
+  };
+
  const saveNetworkSafetySettings = async () => {
    if (!settings || !networkSafetyDirty || networkSafetySaving) return;
    setNetworkSafetySaving(true);
@ -1333,6 +1405,22 @@ export function SettingsView({
            requiresRestartPending={pendingRestartSections.image}
          />
        );
+      case "voice":
+        return (
+          <TranscriptionSettings
+            settings={settings}
+            form={transcriptionForm}
+            dirty={transcriptionDirty}
+            saving={transcriptionSaving}
+            onChangeForm={setTranscriptionForm}
+            onSave={saveTranscriptionSettings}
+            onOpenProviders={() => selectSection("models")}
+            showBrandLogos={localPrefs.brandLogos}
+            onRestart={restartViaSettingsSurface}
+            isRestarting={isRestarting || hostEngineApplying}
+            requiresRestartPending={pendingRestartSections.browser}
+          />
+        );
      case "browser":
        return (
          <WebSettings
@ -1523,6 +1611,7 @@ const SETTINGS_NAV_ITEMS: Array<{ key: SettingsSectionKey; icon: LucideIcon; fal
  { key: "appearance", icon: Palette, fallback: "Appearance" },
  { key: "models", icon: SlidersHorizontal, fallback: "Models" },
  { key: "image", icon: ImageIcon, fallback: "Image" },
+  { key: "voice", icon: Mic, fallback: "Voice" },
  { key: "browser", icon: Globe2, fallback: "Web" },
  { key: "runtime", icon: Server, fallback: "System" },
  { key: "advanced", icon: ShieldCheck, fallback: "Security" },
@ -1642,6 +1731,24 @@ function OverviewSettings({
  const webStatus = settings.web.enable
    ? tx("settings.values.enabled", "Enabled")
    : tx("settings.values.disabled", "Disabled");
+  const webSearchProvider =
+    settings.web_search.providers.find((provider) => provider.name === settings.web_search.provider) ??
+    settings.web_search.providers[0];
+  const webSearchProviderLabel = providerDisplayLabel(
+    settings.web_search.providers,
+    settings.web_search.provider,
+  );
+  const webSearchCredentialStatus =
+    webSearchProvider?.credential === "none"
+      ? tx("settings.byok.webSearch.noCredentialRequired", "No key required")
+      : webSearchProvider?.credential === "base_url"
+        ? settings.web_search.base_url
+          ? tx("settings.values.configured", "Configured")
+          : tx("settings.values.notConfigured", "Not configured")
+        : settings.web_search.api_key_hint
+          ? tx("settings.values.configured", "Configured")
+          : tx("settings.values.notConfigured", "Not configured");
+  const webCaption = `${webSearchProviderLabel} · ${webSearchCredentialStatus}`;
  const imageStatus = settings.image_generation.enabled
    ? tx("settings.values.enabled", "Enabled")
    : tx("settings.values.disabled", "Disabled");
@ -1650,6 +1757,15 @@ function OverviewSettings({
      ? tx("settings.values.configured", "Configured")
      : tx("settings.values.notConfigured", "Not configured")
  }`;
+  const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS;
+  const voiceStatus = transcription.enabled
+    ? tx("settings.values.enabled", "Enabled")
+    : tx("settings.values.disabled", "Disabled");
+  const voiceCaption = `${providerDisplayLabel(transcription.providers, transcription.provider)} · ${
+    transcription.provider_configured
+      ? tx("settings.values.configured", "Configured")
+      : tx("settings.values.notConfigured", "Not configured")
+  }`;
  const isNativeHost = (settings.surface ?? settings.runtime_surface) === "native";
  const workspaceCaption = shortWorkspacePath(settings.runtime.workspace_path);
  const runtimeTitle = isNativeHost
@ -1691,8 +1807,8 @@ function OverviewSettings({
            icon={Globe2}
            valueLogoProvider={settings.web_search.provider}
            title={tx("settings.overview.webSearch", "Web search")}
-            value={providerDisplayLabel(settings.web_search.providers, settings.web_search.provider)}
-            caption={webStatus}
+            value={webStatus}
+            caption={webCaption}
            showBrandLogos={showBrandLogos}
            onClick={() => onSelectSection("browser")}
          />
@ -1705,6 +1821,15 @@ function OverviewSettings({
            showBrandLogos={showBrandLogos}
            onClick={() => onSelectSection("image")}
          />
+          <OverviewListRow
+            icon={Mic}
+            valueLogoProvider={transcription.provider}
+            title={tx("settings.overview.voiceInput", "Voice input")}
+            value={voiceStatus}
+            caption={voiceCaption}
+            showBrandLogos={showBrandLogos}
+            onClick={() => onSelectSection("voice")}
+          />
        </SettingsGroup>
      </section>

@ -2654,6 +2779,137 @@ function ImageGenerationSettings({
  );
 }

+function TranscriptionSettings({ // settings panel for the "voice" section; all state is owned by the parent and passed in
+  settings,
+  form,
+  dirty,
+  saving,
+  onChangeForm,
+  onSave,
+  onOpenProviders,
+  showBrandLogos,
+  onRestart,
+  isRestarting,
+  requiresRestartPending,
+}: {
+  settings: SettingsPayload;
+  form: TranscriptionSettingsUpdate; // current draft; edits go through onChangeForm
+  dirty: boolean;
+  saving: boolean;
+  onChangeForm: Dispatch<SetStateAction<TranscriptionSettingsUpdate>>;
+  onSave: () => void;
+  onOpenProviders: () => void; // invoked by the "Configure provider" button
+  showBrandLogos: boolean;
+  onRestart?: () => void;
+  isRestarting?: boolean;
+  requiresRestartPending: boolean;
+}) {
+  const { t } = useTranslation();
+  const tx = (key: string, fallback: string) => t(key, { defaultValue: fallback }); // translate with an inline English fallback
+  const transcription = settings.transcription ?? DEFAULT_TRANSCRIPTION_SETTINGS; // payloads without the section fall back to defaults
+  const selectedProvider =
+    transcription.providers.find((provider) => provider.name === form.provider) ??
+    transcription.providers[0]; // NOTE(review): falls back to the first provider when form.provider is not listed, so the status pill may describe a different provider — confirm
+  const providerConfigured = !!selectedProvider?.configured; // false when there is no provider at all
+
+  return (
+    <section>
+      <SettingsSectionTitle>{tx("settings.sections.voiceInput", "Voice input")}</SettingsSectionTitle>
+      <SettingsGroup>
+        <SettingsRow
+          title={tx("settings.rows.transcription", "Transcription")}
+          description={tx("settings.help.transcription", "Transcribe microphone input before sending it. Chat channel voice messages use the same settings.")}
+        >
+          <ToggleButton
+            checked={form.enabled}
+            onChange={(enabled) => onChangeForm((prev) => ({ ...prev, enabled }))}
+            ariaLabel={tx("settings.rows.transcription", "Transcription")}
+            label={form.enabled ? tx("settings.values.on", "On") : tx("settings.values.off", "Off")}
+          />
+        </SettingsRow>
+        <SettingsRow
+          title={tx("settings.rows.transcriptionProvider", "Provider")}
+          description={tx("settings.help.transcriptionProvider", "Uses the matching provider credentials from Providers.")}
+        >
+          <ProviderPicker
+            providers={transcription.providers}
+            value={form.provider}
+            emptyLabel={tx("settings.voice.selectProvider", "Select provider")}
+            showProviderLogos={showBrandLogos}
+            onChange={(provider) => onChangeForm((prev) => ({ ...prev, provider }))}
+          />
+        </SettingsRow>
+        <SettingsRow
+          title={tx("settings.rows.transcriptionProviderStatus", "Provider status")}
+          description={tx("settings.help.transcriptionProviderStatus", "API keys stay under providers, not in transcription settings.")}
+        >
+          <div className="flex flex-wrap items-center justify-end gap-2">
+            <StatusPill tone={providerConfigured ? "success" : "neutral"}>
+              {providerConfigured
+                ? tx("settings.values.configured", "Configured")
+                : tx("settings.values.notConfigured", "Not configured")}
+            </StatusPill>
+            {!providerConfigured ? (
+              <Button size="sm" variant="outline" onClick={onOpenProviders} className="rounded-full">
+                {tx("settings.voice.configureProvider", "Configure provider")}
+              </Button>
+            ) : null}
+          </div>
+        </SettingsRow>
+        <SettingsRow
+          title={tx("settings.rows.transcriptionModel", "Model")}
+          description={tx("settings.help.transcriptionModel", "Leave as the resolved default unless your provider needs a custom model id.")}
+        >
+          <Input
+            value={form.model}
+            onChange={(event) => onChangeForm((prev) => ({ ...prev, model: event.target.value }))}
+            className="h-8 w-[min(300px,70vw)] rounded-full text-[13px]"
+          />
+        </SettingsRow>
+        <SettingsRow
+          title={tx("settings.rows.transcriptionLanguage", "Language")}
+          description={tx("settings.help.transcriptionLanguage", "Optional ISO-639 hint such as en, zh, ja, or ko.")}
+        >
+          <Input
+            value={form.language}
+            onChange={(event) => onChangeForm((prev) => ({ ...prev, language: event.target.value }))}
+            placeholder={tx("settings.voice.languageAuto", "Auto")}
+            className="h-8 w-[min(180px,60vw)] rounded-full text-[13px]"
+          />
+        </SettingsRow>
+        <SettingsRow title={tx("settings.rows.voiceLimits", "Limits")}>
+          <div className="flex flex-wrap justify-end gap-2">
+            <NumberInput
+              value={form.maxDurationSec}
+              min={1}
+              max={600}
+              suffix="s"
+              onChange={(maxDurationSec) => onChangeForm((prev) => ({ ...prev, maxDurationSec }))}
+            />
+            <NumberInput
+              value={form.maxUploadMb}
+              min={1}
+              max={100}
+              suffix="MB"
+              onChange={(maxUploadMb) => onChangeForm((prev) => ({ ...prev, maxUploadMb }))}
+            />
+          </div>
+        </SettingsRow>
+        <RestartSettingsFooter
+          dirty={dirty}
+          saving={saving}
+          pendingRestart={requiresRestartPending}
+          dirtyMessage={tx("settings.status.restartAfterSaving", "Save changes, then restart when ready.")}
+          pendingMessage={tx("settings.status.savedRestartApply", "Saved. Restart when ready.")}
+          onSave={onSave}
+          onRestart={onRestart}
+          isRestarting={isRestarting}
+        />
+      </SettingsGroup>
+    </section>
+  );
+}
+
 function WebSettings({
  settings,
  form,
--- a/webui/src/components/settings/TokenUsageHeatmap.tsx
+++ b/webui/src/components/settings/TokenUsageHeatmap.tsx
@ -78,16 +78,13 @@ function buildTokenUsageCalendar(
  const today = utcDateFromIsoDay(isoDayInTimeZone(new Date(), timeZone));
  const end = addUtcDays(today, 6 - today.getUTCDay());
  const start = addUtcDays(end, -(TOKEN_HEATMAP_CELLS - 1));
-  const seenMonths = new Set<string>();
  const monthLabels: TokenUsageMonthLabel[] = [];

  const cells = Array.from({ length: TOKEN_HEATMAP_CELLS }, (_, index) => {
    const date = addUtcDays(start, index);
    const key = isoDay(date);
    const row = byDate.get(key);
-    const monthKey = key.slice(0, 7);
-    if (!seenMonths.has(monthKey)) {
-      seenMonths.add(monthKey);
+    if (date.getUTCDate() === 1) {
      monthLabels.push({
        label: monthFormatter.format(date),
        column: Math.floor(index / 7) + 1,
@ -186,16 +183,12 @@ export function TokenUsageHeatmap({
            {tx("settings.usage.shortTitle", "Token Usage")}
          </span>
        </div>
-        <div
-          className="mb-2 grid min-h-4 gap-1.5 text-[10px] font-normal leading-4 text-muted-foreground/62"
-          style={{ gridTemplateColumns: `repeat(${TOKEN_HEATMAP_COLUMNS}, minmax(0, 1fr))` }}
-          aria-hidden
-        >
+        <div className="relative mb-2 h-4 text-[10px] font-normal leading-4 text-muted-foreground/62" aria-hidden>
          {monthLabels.map((month) => (
            <span
              key={`${month.label}-${month.column}`}
-              className="whitespace-nowrap"
-              style={{ gridColumnStart: month.column, gridColumnEnd: "span 4" }}
+              className="absolute top-0 whitespace-nowrap"
+              style={{ left: `${((month.column - 1) / TOKEN_HEATMAP_COLUMNS) * 100}%` }}
            >
              {month.label}
            </span>
--- a/webui/src/components/thread/ThreadComposer.tsx
+++ b/webui/src/components/thread/ThreadComposer.tsx
@ -31,6 +31,7 @@ import {
  History,
  ImageIcon,
  Loader2,
+  Mic,
  Plus,
  RotateCw,
  Shield,
@ -46,6 +47,12 @@ import {
 import { useTranslation } from "react-i18next";

 import { Button } from "@/components/ui/button";
+import {
+  Tooltip,
+  TooltipContent,
+  TooltipProvider,
+  TooltipTrigger,
+} from "@/components/ui/tooltip";
 import {
  WorkspaceAccessMenu,
  WorkspaceProjectPicker,
@ -59,6 +66,7 @@ import {
 } from "@/hooks/useAttachedImages";
 import { useClipboardAndDrop } from "@/hooks/useClipboardAndDrop";
 import type { SendImage, SendOptions } from "@/hooks/useNanobotStream";
+import { useVoiceRecorder, type VoiceRecorderErrorKey } from "@/hooks/useVoiceRecorder";
 import type {
  CliAppInfo,
  GoalStateWsPayload,
@ -79,6 +87,9 @@ import { cn } from "@/lib/utils";
 /** ``<input accept>``: aligned with the server's MIME whitelist. SVG is
 * deliberately excluded to avoid an embedded-script XSS surface. */
 const ACCEPT_ATTR = "image/png,image/jpeg,image/webp,image/gif";
+const VOICE_SHORTCUT_CODE = "KeyD";
+const VOICE_SHORTCUT_ARIA = "Control+Shift+D";
+type VoiceShortcutPlatform = "apple" | "chromeos" | "linux" | "other" | "windows";

 function formatBytes(n: number): string {
  if (n < 1024) return `${n} B`;
@ -86,6 +97,54 @@ function formatBytes(n: number): string {
  return `${(n / (1024 * 1024)).toFixed(1)} MB`;
 }

+/**
+ * Reports whether a keyboard event is the voice shortcut chord: the
+ * `VOICE_SHORTCUT_CODE` key with Control and Shift held, and neither Alt nor
+ * Meta pressed.
+ */
+function isVoiceShortcutDown(event: KeyboardEvent): boolean {
+  if (event.code !== VOICE_SHORTCUT_CODE) return false;
+  if (event.altKey || event.metaKey) return false;
+  return event.ctrlKey && event.shiftKey;
+}
+
+/**
+ * Reports whether a key event releases part of the voice shortcut chord:
+ * the shortcut key itself, or either modifier (Control or Shift).
+ */
+function isVoiceShortcutRelease(event: KeyboardEvent): boolean {
+  if (event.code === VOICE_SHORTCUT_CODE) return true;
+  return event.key === "Control" || event.key === "Shift";
+}
+
+/**
+ * Classifies the current platform so the shortcut hint can be labelled.
+ *
+ * Returns "other" when `navigator` is undefined. Otherwise the optional
+ * `userAgentData.platform`, `navigator.platform`, and `navigator.userAgent`
+ * are joined into one lowercase string and tested against the platform
+ * patterns below in order; the first match wins, so the order is significant.
+ */
+function getVoiceShortcutPlatform(): VoiceShortcutPlatform {
+  if (typeof navigator === "undefined") return "other";
+  // The cast widens `Navigator` with an optional `userAgentData` field.
+  const userAgentData = (navigator as Navigator & { userAgentData?: { platform?: string } })
+    .userAgentData;
+  const platform = [
+    userAgentData?.platform,
+    navigator.platform,
+    navigator.userAgent,
+  ].filter(Boolean).join(" ").toLowerCase();
+  // "MacIntel" combined with multi-touch is treated as an iPad reporting a Mac platform.
+  const isIpadPretendingToBeMac =
+    navigator.platform === "MacIntel" && navigator.maxTouchPoints > 1;
+  if (isIpadPretendingToBeMac || /mac|iphone|ipad|ipod/.test(platform)) return "apple";
+  if (/win/.test(platform)) return "windows";
+  // "cros" is tested before the linux/x11 pattern — presumably because
+  // ChromeOS strings can also match it; TODO confirm.
+  if (/cros/.test(platform)) return "chromeos";
+  if (/linux|x11|android/.test(platform)) return "linux";
+  return "other";
+}
+
+/**
+ * Returns the human-readable shortcut hint for the detected platform:
+ * modifier glyphs on Apple platforms, a spelled-out "Ctrl" form elsewhere.
+ */
+function getVoiceShortcutLabel(): string {
+  const platform = getVoiceShortcutPlatform();
+  if (platform === "apple") return "⌃⇧D";
+  return "Ctrl ⇧ D";
+}
+
 interface ThreadComposerProps {
  onSend: (content: string, images?: SendImage[], options?: SendOptions) => void;
  disabled?: boolean;
@ -101,6 +160,7 @@ interface ThreadComposerProps {
  cliApps?: CliAppInfo[];
  mcpPresets?: McpPresetInfo[];
  onStop?: () => void;
+  onTranscribeAudio?: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
  /** Unix seconds from server; turn elapsed timer above input while set. */
  runStartedAt?: number | null;
  /** Sustained objective for this chat (WebSocket ``goal_state``). */
@ -138,6 +198,45 @@ const QUEUED_PROMPTS_STORAGE_PREFIX = "nanobot.webui.composerQueuedGuidance.v1:"
 const QUEUED_PROMPTS_LIMIT = 20;
 const QUEUED_PROMPT_MAX_CHARS = 4000;

+/**
+ * Inline recording indicator: a row of vertical bars, one per entry in
+ * `levels`, followed by the elapsed-time label.
+ *
+ * Each `levels` entry is applied directly as the bar's CSS `height`. The bars
+ * are `aria-hidden`; the wrapper carries `ariaLabel` and `aria-live="polite"`.
+ * `isHero` selects the row height (`h-8` vs `h-9`), and `className` is merged
+ * onto the wrapper last.
+ */
+function VoiceRecordingMeter({
+  ariaLabel,
+  className,
+  elapsedLabel,
+  isHero,
+  levels,
+}: {
+  ariaLabel: string;
+  className?: string;
+  elapsedLabel: string;
+  isHero: boolean;
+  levels: number[];
+}) {
+  return (
+    <div
+      className={cn(
+        "flex min-w-0 items-center gap-2 text-neutral-700 dark:text-white",
+        isHero ? "h-8" : "h-9",
+        className,
+      )}
+      aria-live="polite"
+      aria-label={ariaLabel}
+    >
+      <span className="flex h-5 min-w-0 flex-1 items-center justify-between overflow-hidden" aria-hidden>
+        {levels.map((height, index) => (
+          <span
+            key={index}
+            className="w-[2px] rounded-full bg-current opacity-85 transition-[height] duration-75 ease-linear motion-reduce:transition-none"
+            style={{ height }}
+          />
+        ))}
+      </span>
+      <span className="min-w-[2.1rem] text-right text-[12px] font-medium tabular-nums text-muted-foreground">
+        {elapsedLabel}
+      </span>
+    </div>
+  );
+}
+
 type SlashPalettePlacement = "above" | "below";

 interface SlashPaletteLayout {
@ -656,6 +755,7 @@ export function ThreadComposer({
  cliApps = [],
  mcpPresets = [],
  onStop,
+  onTranscribeAudio,
  runStartedAt = null,
  goalState,
  workspaceScope = null,
@ -685,7 +785,9 @@ export function ThreadComposer({
  const wasStreamingRef = useRef(isStreaming);
  const skipNextQueuedFlushRef = useRef(false);
  const skipQueuedPromptPersistRef = useRef(false);
+  const voiceShortcutDownRef = useRef(false);
  const isHero = variant === "hero";
+  const voiceShortcutLabel = useMemo(getVoiceShortcutLabel, []);
  const queuedPromptStorageKey = useMemo(
    () => queuedPromptsStorageKey(pendingQueueKey),
    [pendingQueueKey],
@ -1026,6 +1128,65 @@ export function ThreadComposer({
    });
  }, []);

+  // Appends a finished transcript to the composer text. Whitespace-only
+  // transcripts are ignored; a blank draft is replaced outright; otherwise a
+  // single space is inserted unless the draft already ends in whitespace.
+  // Also resets the slash and CLI-app menu "dismissed" flags, clears any
+  // inline error, and re-measures the textarea.
+  const appendTranscription = useCallback((text: string) => {
+    const transcript = text.trim();
+    if (!transcript) return;
+    setValue((current) => {
+      if (!current.trim()) return transcript;
+      const separator = /[\s\n]$/.test(current) ? "" : " ";
+      return `${current}${separator}${transcript}`;
+    });
+    setSlashMenuDismissed(false);
+    setCliAppMenuDismissed(false);
+    setInlineError(null);
+    resizeTextarea();
+  }, [resizeTextarea]);
+
+  // Adapters handed to the voice recorder hook. Errors arrive as keys and are
+  // translated under `thread.composer.voiceErrors.<key>` before being shown
+  // as the composer's inline error.
+  const clearInlineError = useCallback(() => setInlineError(null), []);
+  const setVoiceError = useCallback((key: VoiceRecorderErrorKey) => {
+    setInlineError(t(`thread.composer.voiceErrors.${key}`));
+  }, [t]);
+  const voiceRecorder = useVoiceRecorder({
+    disabled,
+    onClearError: clearInlineError,
+    onError: setVoiceError,
+    onTranscript: appendTranscription,
+    onTranscribeAudio,
+  });
+
+  // Global hold-to-talk keyboard shortcut. Listeners are installed on
+  // `window` only when a transcription callback is provided, and are removed
+  // again on cleanup.
+  useEffect(() => {
+    if (!onTranscribeAudio) return;
+
+    function onKeyDown(event: KeyboardEvent): void {
+      // Ignore non-matching keys, auto-repeat, and re-entry while the chord is already held.
+      if (!isVoiceShortcutDown(event) || event.repeat || voiceShortcutDownRef.current) return;
+      event.preventDefault();
+      voiceShortcutDownRef.current = true;
+      voiceRecorder.beginShortcutHold();
+    }
+
+    function onKeyUp(event: KeyboardEvent): void {
+      // Releasing any key of the chord ends the hold.
+      if (!voiceShortcutDownRef.current || !isVoiceShortcutRelease(event)) return;
+      event.preventDefault();
+      voiceShortcutDownRef.current = false;
+      voiceRecorder.endShortcutHold();
+    }
+
+    function onWindowBlur(): void {
+      // End the hold on blur as well — presumably because the matching keyup
+      // may never reach this window once focus is lost.
+      if (!voiceShortcutDownRef.current) return;
+      voiceShortcutDownRef.current = false;
+      voiceRecorder.endShortcutHold();
+    }
+
+    window.addEventListener("keydown", onKeyDown);
+    window.addEventListener("keyup", onKeyUp);
+    window.addEventListener("blur", onWindowBlur);
+    return () => {
+      window.removeEventListener("keydown", onKeyDown);
+      window.removeEventListener("keyup", onKeyUp);
+      window.removeEventListener("blur", onWindowBlur);
+    };
+  }, [onTranscribeAudio, voiceRecorder.beginShortcutHold, voiceRecorder.endShortcutHold]);
+
  const chooseSlashCommand = useCallback(
    (command: SlashCommand) => {
      if (command.command === "/stop" && isStreaming && onStop) {
@ -1341,6 +1502,23 @@ export function ThreadComposer({
  );

  const attachButtonDisabled = disabled || full;
+  const showVoiceButton = Boolean(onTranscribeAudio);
+  const voiceRecordingStatusLabel = t("thread.composer.voice.recordingStatus", {
+    time: voiceRecorder.elapsedLabel,
+    defaultValue: `Recording ${voiceRecorder.elapsedLabel}`,
+  });
+  const voiceButtonLabel =
+    voiceRecorder.state === "recording"
+      ? t("thread.composer.voice.stop")
+      : voiceRecorder.state === "transcribing"
+        ? t("thread.composer.voice.transcribing")
+        : t("thread.composer.tools.voice");
+  const voiceButtonTooltip =
+    voiceRecorder.state === "recording"
+      ? t("thread.composer.voice.stop")
+      : voiceRecorder.state === "transcribing"
+        ? t("thread.composer.voice.transcribing")
+        : t("thread.composer.voice.hint");
  const showStopButton = isStreaming && !!onStop;
  const relaxedHeroInput = isHero && images.length === 0 && !isStreaming;
  const inputTextClasses = cn(
@ -1531,7 +1709,15 @@ export function ThreadComposer({
            >
              <Plus className={cn(isHero ? "h-[18px] w-[18px]" : "h-4 w-4")} />
            </Button>
-            {workspaceScope ? (
+            {voiceRecorder.isRecording ? (
+              <VoiceRecordingMeter
+                ariaLabel={voiceRecordingStatusLabel}
+                className="mx-1 flex-1"
+                elapsedLabel={voiceRecorder.elapsedLabel}
+                isHero={isHero}
+                levels={voiceRecorder.levels}
+              />
+            ) : workspaceScope ? (
              <WorkspaceAccessMenu
                scope={workspaceScope}
                disabled={disabled || workspaceScopeDisabled}
@ -1542,7 +1728,7 @@ export function ThreadComposer({
            ) : null}
          </div>
          <div className={cn("flex shrink-0 items-center", isHero ? "gap-1.5" : "gap-2")}>
-            {modelLabel ? (
+            {modelLabel && !voiceRecorder.isRecording ? (
              <ComposerModelBadge
                label={modelLabel}
                provider={modelProvider}
@ -1552,6 +1738,53 @@ export function ThreadComposer({
                onClick={modelNeedsSetup ? onModelBadgeClick : undefined}
              />
            ) : null}
+            {showVoiceButton ? (
+              <TooltipProvider delayDuration={220} skipDelayDuration={80}>
+                <Tooltip>
+                  <TooltipTrigger asChild>
+                    <Button
+                      type="button"
+                      size="icon"
+                      variant="ghost"
+                      disabled={voiceRecorder.buttonDisabled}
+                      aria-label={voiceButtonLabel}
+                      aria-keyshortcuts={VOICE_SHORTCUT_ARIA}
+                      title={voiceButtonTooltip}
+                      onPointerDown={voiceRecorder.beginPress}
+                      onPointerUp={voiceRecorder.endPress}
+                      onPointerCancel={voiceRecorder.endPress}
+                      onClick={voiceRecorder.handleClick}
+                      className={cn(
+                        "rounded-full border border-transparent text-muted-foreground hover:bg-muted/65 hover:text-foreground",
+                        isHero ? "h-8 w-8" : "h-9 w-9",
+                        voiceRecorder.isRecording &&
+                          "bg-red-500 text-white shadow-[0_8px_20px_rgba(239,68,68,0.22)] hover:bg-red-500 hover:text-white",
+                      )}
+                    >
+                      {voiceRecorder.state === "transcribing" ? (
+                        <Loader2 className={cn(isHero ? "h-4 w-4" : "h-4 w-4", "animate-spin")} />
+                      ) : voiceRecorder.isRecording ? (
+                        <Square className={cn(isHero ? "h-3.5 w-3.5" : "h-3.5 w-3.5")} fill="currentColor" />
+                      ) : (
+                        <Mic className={cn(isHero ? "h-4 w-4" : "h-4 w-4")} />
+                      )}
+                    </Button>
+                  </TooltipTrigger>
+                  <TooltipContent
+                    side="top"
+                    align="center"
+                    className="flex items-center gap-2 rounded-full border border-border/70 bg-background px-3 py-1.5 text-[13px] font-medium text-foreground shadow-[0_8px_24px_rgba(15,23,42,0.13)] dark:border-white/10 dark:bg-neutral-900 dark:text-white"
+                  >
+                    <span>{voiceButtonTooltip}</span>
+                    {voiceRecorder.state === "idle" ? (
+                      <kbd className="rounded-full bg-muted px-2 py-0.5 font-sans text-[12px] font-semibold leading-none text-muted-foreground dark:bg-white/10 dark:text-white/80">
+                        {voiceShortcutLabel}
+                      </kbd>
+                    ) : null}
+                  </TooltipContent>
+                </Tooltip>
+              </TooltipProvider>
+            ) : null}
            <Button
              type={showStopButton || modelNeedsSetup ? "button" : "submit"}
              size="icon"
--- a/webui/src/components/thread/ThreadShell.tsx
+++ b/webui/src/components/thread/ThreadShell.tsx
@ -302,6 +302,7 @@ export function ThreadShell({
    runStartedAt,
    goalState,
    send,
+    transcribeAudio,
    stop,
    setMessages,
    streamError,
@ -642,6 +643,7 @@ export function ThreadShell({
          cliApps={cliApps}
          mcpPresets={mcpPresets}
          onStop={stop}
+          onTranscribeAudio={transcribeAudio}
          runStartedAt={runStartedAt}
          goalState={goalState}
          workspaceScope={workspaceScope}
@ -672,6 +674,7 @@ export function ThreadShell({
          cliApps={cliApps}
          mcpPresets={mcpPresets}
          runStartedAt={runStartedAt}
+          onTranscribeAudio={transcribeAudio}
          goalState={goalState}
          workspaceScope={workspaceScope}
          workspaceDefaultScope={workspaceDefaultScope}
--- a/webui/src/hooks/useNanobotStream.ts
+++ b/webui/src/hooks/useNanobotStream.ts
@ -438,6 +438,7 @@ export function useNanobotStream(
  /** Latest sustained goal for this ``chatId`` (``goal_state`` WS events). */
  goalState: GoalStateWsPayload | undefined;
  send: (content: string, images?: SendImage[], options?: SendOptions) => void;
+  transcribeAudio: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
  stop: () => void;
  setMessages: React.Dispatch<React.SetStateAction<UIMessage[]>>;
  /** Latest transport-level fault raised since the last ``dismissStreamError``.
@ -1089,12 +1090,19 @@ export function useNanobotStream(
    client.sendMessage(chatId, "/stop");
  }, [chatId, clearActivitySegment, client, flushPendingStreamEvents]);

+  // Memoized wrapper (changes only with `client`) that forwards an audio data
+  // URL and optional duration to the client's transcription call.
+  const transcribeAudio = useCallback(
+    (dataUrl: string, options?: { durationMs?: number }) =>
+      client.transcribeAudio(dataUrl, options),
+    [client],
+  );
+
  return {
    messages,
    isStreaming,
    runStartedAt,
    goalState,
    send,
+    transcribeAudio,
    stop,
    setMessages,
    streamError,
--- a/webui/src/hooks/useVoiceRecorder.ts
+++ b/webui/src/hooks/useVoiceRecorder.ts
@ -0,0 +1,422 @@
+import {
+  useCallback,
+  useEffect,
+  useRef,
+  useState,
+  type PointerEvent as ReactPointerEvent,
+} from "react";
+
+// A recording is stopped automatically once it has run this long.
+const VOICE_RECORDING_MAX_MS = 120_000;
+// Recordings shorter than this are discarded and reported as "tooShort".
+const VOICE_RECORDING_MIN_MS = 650;
+// Delay after start before a "noInput" hint is shown if only silence was measured.
+const VOICE_NO_INPUT_HINT_MS = 1_100;
+// A pointer press must last this long before it counts as hold-to-record.
+const VOICE_HOLD_START_MS = 140;
+// Number of bars kept in the waveform (length of the `levels` array).
+const VOICE_WAVEFORM_BAR_COUNT = 64;
+// Bar heights: silent bars, the quietest audible bar, and the loudest bar.
+const VOICE_WAVEFORM_SILENT_HEIGHT = 3;
+const VOICE_WAVEFORM_MIN_HEIGHT = 7;
+const VOICE_WAVEFORM_MAX_HEIGHT = 34;
+// Normalized level below which a sample counts as silence.
+const VOICE_MIN_LEVEL = 0.018;
+// Initial waveform: every bar at the silent height.
+const VOICE_WAVEFORM_IDLE_LEVELS = Array.from(
+  { length: VOICE_WAVEFORM_BAR_COUNT },
+  () => VOICE_WAVEFORM_SILENT_HEIGHT,
+);
+// Recorder MIME types in order of preference; the first supported one is used.
+const VOICE_MIME_CANDIDATES = [
+  "audio/webm;codecs=opus",
+  "audio/webm",
+  "audio/mp4",
+  "audio/ogg;codecs=opus",
+] as const;
+
+/** Lifecycle of the recorder: waiting, capturing audio, or awaiting the transcript. */
+export type VoiceRecorderState = "idle" | "recording" | "transcribing";
+/** Error identifiers passed to `onError`; callers map them to user-facing text. */
+export type VoiceRecorderErrorKey =
+  | "failed"
+  | "noInput"
+  | "notConfigured"
+  | "permission"
+  | "tooLong"
+  | "tooShort"
+  | "unsupported";
+
+/** Inputs for `useVoiceRecorder`. */
+interface VoiceRecorderOptions {
+  /** Blocks starting a recording via press or shortcut and disables the button. */
+  disabled?: boolean;
+  /** Called when a previously reported error should be dismissed. */
+  onClearError: () => void;
+  /** Called with an error key when recording or transcription fails. */
+  onError: (key: VoiceRecorderErrorKey) => void;
+  /** Receives the text resolved by `onTranscribeAudio`. */
+  onTranscript: (text: string) => void;
+  /** Transcribes a recorded audio data URL; when absent, recording never starts. */
+  onTranscribeAudio?: (dataUrl: string, options?: { durationMs?: number }) => Promise<string>;
+}
+
+export function useVoiceRecorder({
+  disabled,
+  onClearError,
+  onError,
+  onTranscript,
+  onTranscribeAudio,
+}: VoiceRecorderOptions) {
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const chunksRef = useRef<BlobPart[]>([]);
+  const streamRef = useRef<MediaStream | null>(null);
+  const audioRef = useRef<VoiceAudioState | null>(null);
+  const startedAtRef = useRef(0);
+  const maxTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const inputHintTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const holdTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const holdActiveRef = useRef(false);
+  const startPendingRef = useRef(false);
+  const stopAfterStartRef = useRef(false);
+  const suppressClickRef = useRef(false);
+  const suppressClickTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const shortcutActiveRef = useRef(false);
+  const levelObservedRef = useRef(false);
+  const peakLevelRef = useRef(0);
+  const levelReliableRef = useRef(false);
+  const noInputHintVisibleRef = useRef(false);
+  const [state, setState] = useState<VoiceRecorderState>("idle");
+  const [elapsedMs, setElapsedMs] = useState(0);
+  const [levels, setLevels] = useState<number[]>(VOICE_WAVEFORM_IDLE_LEVELS);
+
+  const clearInputHintTimer = useCallback(() => clearTimer(inputHintTimerRef), []);
+  const clearSuppressClickTimer = useCallback(() => clearTimer(suppressClickTimerRef), []);
+
+  // Marks the next click as one to ignore. The mark expires on its own after
+  // 500 ms if no click arrives; any earlier expiry timer is replaced.
+  const suppressNextClick = useCallback(() => {
+    clearSuppressClickTimer();
+    suppressClickRef.current = true;
+    suppressClickTimerRef.current = setTimeout(() => {
+      suppressClickRef.current = false;
+      suppressClickTimerRef.current = null;
+    }, 500);
+  }, [clearSuppressClickTimer]);
+
+  // Tears down the level meter: cancels the pending animation frame,
+  // disconnects the audio nodes, and closes the AudioContext (close errors are
+  // ignored). A no-op when no meter is running.
+  const stopWaveform = useCallback(() => {
+    const audio = audioRef.current;
+    // Clear the ref first so an in-flight frame callback sees the meter as stopped.
+    audioRef.current = null;
+    if (!audio) return;
+    if (audio.frame !== null) cancelAnimationFrame(audio.frame);
+    audio.source.disconnect();
+    audio.analyser.disconnect();
+    void audio.context.close().catch(() => undefined);
+  }, []);
+
+  // Starts the live level meter for `stream`. Connects a media-stream source
+  // to an analyser and samples it once per animation frame, pushing one new
+  // bar height into `levels` per sampled frame. Does nothing when no
+  // AudioContext constructor is available; a setup error tears the meter down
+  // again without reporting anything.
+  const startWaveform = useCallback((stream: MediaStream) => {
+    const AudioContextCtor = audioContextConstructor();
+    if (!AudioContextCtor) return;
+    stopWaveform();
+    setLevels(VOICE_WAVEFORM_IDLE_LEVELS);
+    try {
+      const context = new AudioContextCtor();
+      const source = context.createMediaStreamSource(stream);
+      const analyser = context.createAnalyser();
+      analyser.fftSize = 256;
+      analyser.smoothingTimeConstant = 0.68;
+      source.connect(analyser);
+      const audio: VoiceAudioState = {
+        analyser,
+        context,
+        data: new Uint8Array(analyser.fftSize),
+        frame: null,
+        source,
+      };
+      const tick = () => {
+        const current = audioRef.current;
+        // The meter was stopped (audioRef cleared): end the frame loop.
+        if (!current) return;
+        // Context not running yet: ask it to resume and retry on the next frame.
+        if (current.context.state !== "running") {
+          void current.context.resume().catch(() => undefined);
+          current.frame = requestAnimationFrame(tick);
+          return;
+        }
+        current.analyser.getByteTimeDomainData(current.data);
+        const level = voiceLevelFromSamples(current.data);
+        // A real sample was measured; these flags gate the silence checks elsewhere.
+        levelReliableRef.current = true;
+        levelObservedRef.current = true;
+        peakLevelRef.current = Math.max(peakLevelRef.current, level);
+        // Audible input: cancel the pending "no input" hint and clear one that is showing.
+        if (level >= VOICE_MIN_LEVEL) {
+          clearInputHintTimer();
+          if (noInputHintVisibleRef.current) {
+            noInputHintVisibleRef.current = false;
+            onClearError();
+          }
+        }
+        // Sliding window: drop the oldest bar, append the newest.
+        setLevels((currentLevels) => [
+          ...currentLevels.slice(1),
+          waveformHeightFromLevel(level),
+        ]);
+        current.frame = requestAnimationFrame(tick);
+      };
+      audioRef.current = audio;
+      void context.resume().catch(() => undefined);
+      audio.frame = requestAnimationFrame(tick);
+    } catch {
+      stopWaveform();
+    }
+  }, [clearInputHintTimer, onClearError, stopWaveform]);
+
+  // Releases everything tied to the current recording: clears the hold,
+  // input-hint, and max-duration timers, stops the waveform, stops every track
+  // of the captured stream, drops the recorder reference, and resets the
+  // pending-start, shortcut, and no-input-hint flags.
+  const cleanupRecording = useCallback(() => {
+    clearTimer(holdTimerRef);
+    clearInputHintTimer();
+    clearTimer(maxTimerRef);
+    stopWaveform();
+    streamRef.current?.getTracks().forEach((track) => track.stop());
+    streamRef.current = null;
+    mediaRecorderRef.current = null;
+    startPendingRef.current = false;
+    shortcutActiveRef.current = false;
+    noInputHintVisibleRef.current = false;
+  }, [clearInputHintTimer, stopWaveform]);
+
+  // Stops the active recorder, if any. Does nothing when there is no recorder
+  // or it is already inactive.
+  const stopRecording = useCallback(() => {
+    const recorder = mediaRecorderRef.current;
+    if (!recorder || recorder.state === "inactive") return;
+    recorder.stop();
+  }, []);
+
+  // Stops the active recorder, or — when no recorder is active but a start is
+  // still pending — flags that recording should stop as soon as it has started.
+  const stopRecordingWhenReady = useCallback(() => {
+    const recorder = mediaRecorderRef.current;
+    if (recorder && recorder.state !== "inactive") {
+      stopRecording();
+    } else if (startPendingRef.current) {
+      stopAfterStartRef.current = true;
+    }
+  }, [stopRecording]);
+
+  /**
+   * Requests the microphone and starts a new recording.
+   *
+   * A no-op unless a transcription callback exists, the recorder is idle, and
+   * no start is already in flight. Reports "unsupported" when the browser
+   * lacks `getUserMedia` or `MediaRecorder`, "permission" when the microphone
+   * request rejects, and "unsupported" when the recorder cannot be created or
+   * started after the microphone was granted.
+   *
+   * When the recorder stops, the captured chunks are validated (non-empty, at
+   * least `VOICE_RECORDING_MIN_MS` long, not measured as silent) and then sent
+   * to `onTranscribeAudio`; the resulting text is passed to `onTranscript`.
+   */
+  const startRecording = useCallback(async () => {
+    if (!onTranscribeAudio || state !== "idle" || startPendingRef.current) return;
+    if (!navigator.mediaDevices?.getUserMedia || typeof MediaRecorder === "undefined") {
+      onError("unsupported");
+      return;
+    }
+    // Only cleanupRecording() resets this flag, so further start requests are
+    // ignored for the whole lifetime of the recording, not just while the
+    // permission prompt is open.
+    startPendingRef.current = true;
+    let micGranted = false;
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      micGranted = true;
+      // Publish the stream before anything else can throw: cleanupRecording()
+      // stops tracks through streamRef, so a failing MediaRecorder constructor
+      // or start() must not leave the microphone open.
+      streamRef.current = stream;
+      const recorder = new MediaRecorder(stream, mediaRecorderOptions());
+      chunksRef.current = [];
+      mediaRecorderRef.current = recorder;
+      startedAtRef.current = Date.now();
+      levelObservedRef.current = false;
+      peakLevelRef.current = 0;
+      levelReliableRef.current = false;
+      noInputHintVisibleRef.current = false;
+      setElapsedMs(0);
+      startWaveform(stream);
+      recorder.ondataavailable = (event) => {
+        if (event.data.size > 0) chunksRef.current.push(event.data);
+      };
+      recorder.onstop = () => {
+        // Snapshot everything that cleanupRecording() is about to reset.
+        const chunks = chunksRef.current.splice(0);
+        const durationMs = Math.max(0, Date.now() - startedAtRef.current);
+        const mimeType = recorder.mimeType || "audio/webm";
+        // Only trust "silence" when the level meter actually produced samples.
+        const hasMeasuredSilence =
+          levelReliableRef.current
+          && levelObservedRef.current
+          && peakLevelRef.current < VOICE_MIN_LEVEL;
+        cleanupRecording();
+        if (chunks.length === 0) {
+          setState("idle");
+          return;
+        }
+        if (durationMs < VOICE_RECORDING_MIN_MS) {
+          setState("idle");
+          onError("tooShort");
+          return;
+        }
+        if (hasMeasuredSilence) {
+          setState("idle");
+          onError("noInput");
+          return;
+        }
+        setState("transcribing");
+        void blobToDataUrl(new Blob(chunks, { type: mimeType }))
+          .then((dataUrl) => onTranscribeAudio(dataUrl, { durationMs }))
+          .then(onTranscript)
+          .catch((error) => onError(transcriptionErrorKey(error)))
+          .finally(() => setState("idle"));
+      };
+      recorder.start();
+      setState("recording");
+      onClearError();
+      maxTimerRef.current = setTimeout(stopRecording, VOICE_RECORDING_MAX_MS);
+      // Surface a "no input" hint if only silence has been measured so far.
+      inputHintTimerRef.current = setTimeout(() => {
+        const recording = mediaRecorderRef.current?.state === "recording";
+        if (
+          !recording
+          || !levelReliableRef.current
+          || !levelObservedRef.current
+          || peakLevelRef.current >= VOICE_MIN_LEVEL
+        ) {
+          return;
+        }
+        noInputHintVisibleRef.current = true;
+        onError("noInput");
+      }, VOICE_NO_INPUT_HINT_MS);
+    } catch {
+      cleanupRecording();
+      setState("idle");
+      // A failure after the microphone was granted is a recorder problem, not
+      // a permission problem.
+      onError(micGranted ? "unsupported" : "permission");
+    }
+  }, [
+    cleanupRecording,
+    onClearError,
+    onError,
+    onTranscribeAudio,
+    onTranscript,
+    startWaveform,
+    state,
+    stopRecording,
+  ]);
+
+  // Starts recording and honors a stop that was requested while the start was
+  // still in flight (see `stopAfterStartRef`).
+  const startRecordingWithDeferredStop = useCallback(() => {
+    stopAfterStartRef.current = false;
+    void startRecording().then(() => {
+      if (!stopAfterStartRef.current) return;
+      stopAfterStartRef.current = false;
+      stopRecording();
+    });
+  }, [startRecording, stopRecording]);
+
+  // Pointer-down handler for the voice button. Ignores non-primary mouse
+  // buttons and does nothing unless transcription is available, the control is
+  // enabled, and the recorder is idle. Arms a timer: if it fires after
+  // VOICE_HOLD_START_MS without being cleared, hold-to-record starts and the
+  // click that follows the release is suppressed.
+  const beginPress = useCallback((event: ReactPointerEvent<HTMLButtonElement>) => {
+    if (event.pointerType === "mouse" && event.button !== 0) return;
+    if (!onTranscribeAudio || disabled || state !== "idle") return;
+    clearTimer(holdTimerRef);
+    try {
+      event.currentTarget.setPointerCapture(event.pointerId);
+    } catch {
+      // Some embedded runtimes do not expose pointer capture for toolbar buttons.
+    }
+    holdTimerRef.current = setTimeout(() => {
+      holdTimerRef.current = null;
+      holdActiveRef.current = true;
+      suppressNextClick();
+      startRecordingWithDeferredStop();
+    }, VOICE_HOLD_START_MS);
+  }, [disabled, onTranscribeAudio, startRecordingWithDeferredStop, state, suppressNextClick]);
+
+  // Pointer-up / pointer-cancel handler. Always cancels a pending hold timer;
+  // if a hold recording had started, stops it (or schedules the stop when the
+  // recorder is still starting) and suppresses the trailing click.
+  const endPress = useCallback(() => {
+    const wasHoldRecording = holdActiveRef.current;
+    clearTimer(holdTimerRef);
+    if (!wasHoldRecording) return;
+    holdActiveRef.current = false;
+    suppressNextClick();
+    stopRecordingWhenReady();
+  }, [stopRecordingWhenReady, suppressNextClick]);
+
+  // Click handler: toggles recording. A click flagged by suppressNextClick()
+  // (the one that follows a hold gesture) is consumed without any effect.
+  const handleClick = useCallback(() => {
+    if (suppressClickRef.current) {
+      clearSuppressClickTimer();
+      suppressClickRef.current = false;
+      return;
+    }
+    if (state === "recording") stopRecording();
+    else void startRecording();
+  }, [clearSuppressClickTimer, startRecording, state, stopRecording]);
+
+  // Keyboard counterpart of press-and-hold: starts recording once per hold,
+  // and only when transcription is available, enabled, and idle.
+  const beginShortcutHold = useCallback(() => {
+    if (!onTranscribeAudio || disabled || state !== "idle" || shortcutActiveRef.current) return;
+    shortcutActiveRef.current = true;
+    startRecordingWithDeferredStop();
+  }, [disabled, onTranscribeAudio, startRecordingWithDeferredStop, state]);
+
+  // Ends a keyboard hold started by beginShortcutHold; a no-op when no
+  // shortcut hold is active.
+  const endShortcutHold = useCallback(() => {
+    if (!shortcutActiveRef.current) return;
+    shortcutActiveRef.current = false;
+    stopRecordingWhenReady();
+  }, [stopRecordingWhenReady]);
+
+  // Drives the elapsed-time readout: reset to 0 whenever not recording,
+  // otherwise refreshed immediately and then every 250 ms from startedAtRef.
+  useEffect(() => {
+    if (state !== "recording") {
+      setElapsedMs(0);
+      return;
+    }
+    const updateElapsed = () => {
+      setElapsedMs(Math.max(0, Date.now() - startedAtRef.current));
+    };
+    updateElapsed();
+    const interval = window.setInterval(updateElapsed, 250);
+    return () => window.clearInterval(interval);
+  }, [state]);
+
+  useEffect(() => cleanupRecording, [cleanupRecording]);
+  useEffect(() => () => clearSuppressClickTimer(), [clearSuppressClickTimer]);
+
+  return {
+    beginShortcutHold,
+    beginPress,
+    buttonDisabled: disabled || state === "transcribing",
+    elapsedLabel: formatVoiceElapsed(elapsedMs),
+    endShortcutHold,
+    endPress,
+    handleClick,
+    isRecording: state === "recording",
+    levels,
+    state,
+  };
+}
+
+/** Web Audio objects backing the live level meter for one recording. */
+interface VoiceAudioState {
+  /** Analyser the time-domain samples are read from. */
+  analyser: AnalyserNode;
+  /** Audio context owning the nodes; closed when the meter stops. */
+  context: AudioContext;
+  /** Sample buffer refilled on every measured frame. */
+  data: Uint8Array<ArrayBuffer>;
+  /** Handle of the scheduled animation frame, or null when none is pending. */
+  frame: number | null;
+  /** Source node wrapping the captured media stream. */
+  source: MediaStreamAudioSourceNode;
+}
+
+/** Cancels the timeout held in `ref`, if any, and resets the ref to null. */
+function clearTimer(ref: { current: ReturnType<typeof setTimeout> | null }) {
+  const handle = ref.current;
+  if (handle === null) return;
+  clearTimeout(handle);
+  ref.current = null;
+}
+
+/**
+ * Picks recorder options using the first MIME type from
+ * `VOICE_MIME_CANDIDATES` that the browser reports as supported. Returns
+ * undefined when `MediaRecorder` is missing or no candidate is supported.
+ */
+function mediaRecorderOptions(): MediaRecorderOptions | undefined {
+  if (typeof MediaRecorder === "undefined") return undefined;
+  for (const candidate of VOICE_MIME_CANDIDATES) {
+    if (MediaRecorder.isTypeSupported(candidate)) return { mimeType: candidate };
+  }
+  return undefined;
+}
+
+/** Formats a millisecond duration as `m:ss`; negative input is clamped to `0:00`. */
+function formatVoiceElapsed(ms: number): string {
+  const totalSeconds = Math.max(0, Math.floor(ms / 1000));
+  const minutePart = Math.floor(totalSeconds / 60);
+  const secondPart = (totalSeconds % 60).toString().padStart(2, "0");
+  return `${minutePart}:${secondPart}`;
+}
+
+/**
+ * Resolves the AudioContext constructor, falling back to the
+ * `webkitAudioContext` property. Returns undefined when `window` is not defined
+ * or neither constructor exists.
+ */
+function audioContextConstructor(): typeof AudioContext | undefined {
+  if (typeof window === "undefined") return undefined;
+  const prefixedWindow = window as unknown as { webkitAudioContext?: typeof AudioContext };
+  return window.AudioContext ?? prefixedWindow.webkitAudioContext;
+}
+
+/**
+ * Converts samples centered on 128 into a level in [0, 1]: the RMS of the
+ * centered samples, scaled by 4.2, raised to the power 0.72, and capped at 1.
+ * An empty buffer yields 0.
+ */
+function voiceLevelFromSamples(samples: ArrayLike<number>): number {
+  const count = samples.length;
+  if (count === 0) return 0;
+  let squares = 0;
+  let cursor = 0;
+  while (cursor < count) {
+    const deviation = (samples[cursor] - 128) / 128;
+    squares += deviation * deviation;
+    cursor += 1;
+  }
+  const boosted = Math.sqrt(squares / count) * 4.2;
+  return Math.min(1, Math.pow(boosted, 0.72));
+}
+
+/**
+ * Maps a level to a bar height. Levels below `VOICE_MIN_LEVEL` use the silent
+ * height; anything else is interpolated linearly between the minimum and
+ * maximum bar heights and rounded.
+ */
+function waveformHeightFromLevel(level: number): number {
+  if (level < VOICE_MIN_LEVEL) return VOICE_WAVEFORM_SILENT_HEIGHT;
+  const heightRange = VOICE_WAVEFORM_MAX_HEIGHT - VOICE_WAVEFORM_MIN_HEIGHT;
+  const normalized = (level - VOICE_MIN_LEVEL) / (1 - VOICE_MIN_LEVEL);
+  const fraction = Math.min(1, normalized);
+  return Math.round(VOICE_WAVEFORM_MIN_HEIGHT + fraction * heightRange);
+}
+
+/**
+ * Reads a blob as a data URL. Rejects with the reader's error (or
+ * `read_failed`) on a read error, and with `invalid_data_url` when the reader
+ * result is not a string.
+ */
+function blobToDataUrl(blob: Blob): Promise<string> {
+  return new Promise<string>((resolve, reject) => {
+    const reader = new FileReader();
+    reader.onerror = () => {
+      reject(reader.error ?? new Error("read_failed"));
+    };
+    reader.onload = () => {
+      const { result } = reader;
+      if (typeof result !== "string") {
+        reject(new Error("invalid_data_url"));
+        return;
+      }
+      resolve(result);
+    };
+    reader.readAsDataURL(blob);
+  });
+}
+
+/**
+ * Maps a transcription failure to an error key using the `Error` message:
+ * `not_configured` → "notConfigured", `duration` → "tooLong"; anything else,
+ * including non-Error values, → "failed".
+ */
+function transcriptionErrorKey(error: unknown): VoiceRecorderErrorKey {
+  if (!(error instanceof Error)) return "failed";
+  switch (error.message) {
+    case "not_configured":
+      return "notConfigured";
+    case "duration":
+      return "tooLong";
+    default:
+      return "failed";
+  }
+}
--- a/webui/src/i18n/locales/en/common.json
+++ b/webui/src/i18n/locales/en/common.json
@ -73,6 +73,7 @@
      "models": "Models",
      "providers": "Providers",
      "image": "Image",
+      "voice": "Voice",
      "browser": "Web",
      "cliApps": "CLI Apps",
      "mcp": "MCP",
@ -99,7 +100,8 @@
      "capabilities": "Capabilities",
      "apps": "Apps",
      "nativeHost": "Native host",
-      "hostSafety": "App safety"
+      "hostSafety": "App safety",
+      "voiceInput": "Voice input"
    },
    "models": {
      "selectModel": "Select model",
@ -161,7 +163,13 @@
      "engine": "Engine",
      "logs": "Logs",
      "diagnostics": "Diagnostics",
-      "contextWindow": "Context window"
+      "contextWindow": "Context window",
+      "transcription": "Transcription",
+      "transcriptionProvider": "Provider",
+      "transcriptionProviderStatus": "Provider status",
+      "transcriptionModel": "Model",
+      "transcriptionLanguage": "Language",
+      "voiceLimits": "Limits"
    },
    "help": {
      "theme": "Switch between light and dark appearance.",
@ -200,7 +208,12 @@
      "diagnostics": "Export a small runtime report for support.",
      "localServiceAccessNative": "Allow Full Access shell commands to reach services on this Mac.",
      "webuiDefaultAccessNative": "Used by native chats without a project-specific permission.",
-      "contextWindow": "Choose the default context budget for this model configuration."
+      "contextWindow": "Choose the default context budget for this model configuration.",
+      "transcription": "Transcribe microphone input before sending it. Chat channel voice messages use the same settings.",
+      "transcriptionProvider": "Uses the matching provider credentials from Providers.",
+      "transcriptionProviderStatus": "API keys stay under providers, not in transcription settings.",
+      "transcriptionModel": "Leave as the resolved default unless your provider needs a custom model id.",
+      "transcriptionLanguage": "Optional ISO-639 hint such as en, zh, ja, or ko."
    },
    "timezone": {
      "select": "Select timezone",
@ -391,6 +404,7 @@
      "totalProviders": "{{count}} available",
      "webSearch": "Web search",
      "imageGeneration": "Image generation",
+      "voiceInput": "Voice input",
      "workspace": "Workspace"
    },
    "usage": {
@ -486,6 +500,11 @@
      "rawInstructions": "Raw SKILL.md",
      "rawInstructionsEmpty": "No raw instructions.",
      "detailDescription": "Details for {{name}}."
+    },
+    "voice": {
+      "selectProvider": "Select provider",
+      "configureProvider": "Configure provider",
+      "languageAuto": "Auto"
    }
  },
  "chat": {
@ -678,6 +697,21 @@
        "deepResearch": "Deep research",
        "voice": "Voice input"
      },
+      "voice": {
+        "hint": "Click to dictate or hold",
+        "stop": "Stop recording",
+        "transcribing": "Transcribing...",
+        "recordingStatus": "Recording {{time}}"
+      },
+      "voiceErrors": {
+        "unsupported": "Voice input is not supported in this browser.",
+        "permission": "Microphone permission is required.",
+        "notConfigured": "Configure a transcription provider first.",
+        "tooLong": "Recording is too long.",
+        "tooShort": "Hold a little longer to record voice.",
+        "noInput": "No microphone input detected.",
+        "failed": "Could not transcribe audio."
+      },
      "slash": {
        "ariaLabel": "Slash commands",
        "label": "commands",
--- a/webui/src/i18n/locales/es/common.json
+++ b/webui/src/i18n/locales/es/common.json
@ -73,6 +73,7 @@
      "models": "Modelos",
      "providers": "Proveedores",
      "image": "Imagen",
+      "voice": "Voz",
      "browser": "Internet",
      "runtime": "Sistema",
      "advanced": "Seguridad",
@ -99,7 +100,8 @@
      "mcp": "Servicios MCP",
      "apps": "Aplicaciones",
      "nativeHost": "Host nativo",
-      "hostSafety": "Seguridad de la app"
+      "hostSafety": "Seguridad de la app",
+      "voiceInput": "Entrada de voz"
    },
    "rows": {
      "theme": "Tema",
@ -142,7 +144,13 @@
      "engine": "Motor",
      "logs": "Registros",
      "diagnostics": "Diagnóstico",
-      "contextWindow": "Ventana de contexto"
+      "contextWindow": "Ventana de contexto",
+      "transcription": "Transcripción",
+      "transcriptionProvider": "Proveedor",
+      "transcriptionProviderStatus": "Estado del proveedor",
+      "transcriptionModel": "Modelo",
+      "transcriptionLanguage": "Idioma",
+      "voiceLimits": "Límites"
    },
    "help": {
      "theme": "Cambia entre apariencia clara y oscura.",
@ -181,7 +189,12 @@
      "diagnostics": "Exporta un pequeño informe de runtime para soporte.",
      "localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.",
      "webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.",
-      "contextWindow": "Elige el presupuesto de contexto predeterminado para esta configuración de modelo."
+      "contextWindow": "Elige el presupuesto de contexto predeterminado para esta configuración de modelo.",
+      "transcription": "Transcribe la entrada del micrófono antes de enviarla. Los mensajes de voz de los canales usan la misma configuración.",
+      "transcriptionProvider": "Usa las credenciales del proveedor correspondiente en Proveedores.",
+      "transcriptionProviderStatus": "Las claves API permanecen en proveedores, no en la configuración de transcripción.",
+      "transcriptionModel": "Déjalo como el valor predeterminado resuelto salvo que el proveedor necesite un id de modelo personalizado.",
+      "transcriptionLanguage": "Pista ISO-639 opcional, como en, zh, ja o ko."
    },
    "values": {
      "light": "Claro",
@ -283,6 +296,7 @@
      "totalProviders": "{{count}} disponibles",
      "webSearch": "Búsqueda web",
      "imageGeneration": "Generación de imágenes",
+      "voiceInput": "Entrada de voz",
      "workspace": "Espacio de trabajo"
    },
    "usage": {
@ -486,6 +500,11 @@
      "rawInstructions": "SKILL.md original",
      "rawInstructionsEmpty": "No hay instrucciones originales.",
      "detailDescription": "Detalles de {{name}}."
+    },
+    "voice": {
+      "selectProvider": "Seleccionar proveedor",
+      "configureProvider": "Configurar proveedor",
+      "languageAuto": "Auto"
    }
  },
  "chat": {
@ -678,6 +697,21 @@
        "deepResearch": "Investigación profunda",
        "voice": "Entrada de voz"
      },
+      "voice": {
+        "hint": "Haz clic para dictar o mantén",
+        "stop": "Detener grabación",
+        "transcribing": "Transcribiendo...",
+        "recordingStatus": "Grabando {{time}}"
+      },
+      "voiceErrors": {
+        "unsupported": "Este navegador no admite entrada de voz.",
+        "permission": "Se requiere permiso de micrófono.",
+        "notConfigured": "Configura primero un proveedor de transcripción.",
+        "tooLong": "La grabación es demasiado larga.",
+        "tooShort": "Mantén pulsado un poco más para grabar voz.",
+        "noInput": "No se detectó entrada del micrófono.",
+        "failed": "No se pudo transcribir el audio."
+      },
      "slash": {
        "ariaLabel": "Comandos slash",
        "label": "comandos",
--- a/webui/src/i18n/locales/fr/common.json
+++ b/webui/src/i18n/locales/fr/common.json
@ -73,6 +73,7 @@
      "models": "Modèles",
      "providers": "Fournisseurs",
      "image": "Images",
+      "voice": "Voix",
      "browser": "Internet",
      "runtime": "Système",
      "advanced": "Sécurité",
@ -99,7 +100,8 @@
      "mcp": "Services MCP",
      "apps": "Applications",
      "nativeHost": "Hôte natif",
-      "hostSafety": "Sécurité de l’app"
+      "hostSafety": "Sécurité de l’app",
+      "voiceInput": "Saisie vocale"
    },
    "rows": {
      "theme": "Thème",
@ -142,7 +144,13 @@
      "engine": "Moteur",
      "logs": "Journaux",
      "diagnostics": "Diagnostic",
-      "contextWindow": "Fenêtre de contexte"
+      "contextWindow": "Fenêtre de contexte",
+      "transcription": "Transcription",
+      "transcriptionProvider": "Fournisseur",
+      "transcriptionProviderStatus": "État du fournisseur",
+      "transcriptionModel": "Modèle",
+      "transcriptionLanguage": "Langue",
+      "voiceLimits": "Limites"
    },
    "help": {
      "theme": "Basculer entre l’apparence claire et sombre.",
@ -181,7 +189,12 @@
      "diagnostics": "Exporte un petit rapport d’exécution pour le support.",
      "localServiceAccessNative": "Autorise les commandes shell Full Access à atteindre les services sur ce Mac.",
      "webuiDefaultAccessNative": "Utilisé par les chats natifs sans permission propre au projet.",
-      "contextWindow": "Choisissez le budget de contexte par défaut pour cette configuration de modèle."
+      "contextWindow": "Choisissez le budget de contexte par défaut pour cette configuration de modèle.",
+      "transcription": "Transcrit l'entrée micro avant l'envoi. Les messages vocaux des canaux utilisent les mêmes réglages.",
+      "transcriptionProvider": "Utilise les identifiants du fournisseur correspondant dans Fournisseurs.",
+      "transcriptionProviderStatus": "Les clés API restent dans les fournisseurs, pas dans les réglages de transcription.",
+      "transcriptionModel": "Laissez le modèle résolu par défaut sauf si votre fournisseur exige un id personnalisé.",
+      "transcriptionLanguage": "Indice ISO-639 facultatif, comme en, zh, ja ou ko."
    },
    "values": {
      "light": "Clair",
@ -283,6 +296,7 @@
      "totalProviders": "{{count}} disponibles",
      "webSearch": "Recherche web",
      "imageGeneration": "Génération d’images",
+      "voiceInput": "Saisie vocale",
      "workspace": "Espace de travail"
    },
    "usage": {
@ -486,6 +500,11 @@
      "rawInstructions": "SKILL.md brut",
      "rawInstructionsEmpty": "Aucune instruction brute.",
      "detailDescription": "Détails de {{name}}."
+    },
+    "voice": {
+      "selectProvider": "Choisir un fournisseur",
+      "configureProvider": "Configurer le fournisseur",
+      "languageAuto": "Auto"
    }
  },
  "chat": {
@ -678,6 +697,21 @@
        "deepResearch": "Recherche approfondie",
        "voice": "Entrée vocale"
      },
+      "voice": {
+        "hint": "Cliquez pour dicter ou maintenez",
+        "stop": "Arrêter l'enregistrement",
+        "transcribing": "Transcription...",
+        "recordingStatus": "Enregistrement {{time}}"
+      },
+      "voiceErrors": {
+        "unsupported": "La saisie vocale n'est pas prise en charge par ce navigateur.",
+        "permission": "L'autorisation du microphone est requise.",
+        "notConfigured": "Configurez d'abord un fournisseur de transcription.",
+        "tooLong": "L'enregistrement est trop long.",
+        "tooShort": "Maintenez un peu plus longtemps pour enregistrer la voix.",
+        "noInput": "Aucune entrée microphone détectée.",
+        "failed": "Impossible de transcrire l'audio."
+      },
      "slash": {
        "ariaLabel": "Commandes slash",
        "label": "commandes",
--- a/webui/src/i18n/locales/id/common.json
+++ b/webui/src/i18n/locales/id/common.json
@ -73,6 +73,7 @@
      "models": "Model",
      "providers": "Penyedia",
      "image": "Gambar",
+      "voice": "Suara",
      "browser": "Internet",
      "runtime": "Sistem",
      "advanced": "Keamanan",
@ -99,7 +100,8 @@
      "mcp": "Layanan MCP",
      "apps": "Aplikasi",
      "nativeHost": "Host native",
-      "hostSafety": "Keamanan aplikasi"
+      "hostSafety": "Keamanan aplikasi",
+      "voiceInput": "Input suara"
    },
    "rows": {
      "theme": "Tema",
@ -142,7 +144,13 @@
      "engine": "Mesin",
      "logs": "Log",
      "diagnostics": "Diagnostik",
-      "contextWindow": "Jendela konteks"
+      "contextWindow": "Jendela konteks",
+      "transcription": "Transkripsi",
+      "transcriptionProvider": "Penyedia",
+      "transcriptionProviderStatus": "Status penyedia",
+      "transcriptionModel": "Model",
+      "transcriptionLanguage": "Bahasa",
+      "voiceLimits": "Batas"
    },
    "help": {
      "theme": "Beralih antara tampilan terang dan gelap.",
@ -181,7 +189,12 @@
      "diagnostics": "Exporta un pequeño informe de runtime para soporte.",
      "localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.",
      "webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.",
-      "contextWindow": "Pilih anggaran konteks default untuk konfigurasi model ini."
+      "contextWindow": "Pilih anggaran konteks default untuk konfigurasi model ini.",
+      "transcription": "Transkripsikan input mikrofon sebelum dikirim. Pesan suara channel memakai pengaturan yang sama.",
+      "transcriptionProvider": "Menggunakan kredensial penyedia yang sesuai dari Providers.",
+      "transcriptionProviderStatus": "API key tetap berada di providers, bukan di pengaturan transkripsi.",
+      "transcriptionModel": "Biarkan memakai default yang teresolusi kecuali penyedia membutuhkan id model khusus.",
+      "transcriptionLanguage": "Petunjuk ISO-639 opsional, seperti en, zh, ja, atau ko."
    },
    "values": {
      "light": "Terang",
@ -283,6 +296,7 @@
      "totalProviders": "{{count}} tersedia",
      "webSearch": "Pencarian web",
      "imageGeneration": "Pembuatan gambar",
+      "voiceInput": "Input suara",
      "workspace": "Ruang kerja"
    },
    "usage": {
@ -486,6 +500,11 @@
      "rawInstructions": "SKILL.md mentah",
      "rawInstructionsEmpty": "Tidak ada instruksi mentah.",
      "detailDescription": "Detail untuk {{name}}."
+    },
+    "voice": {
+      "selectProvider": "Pilih penyedia",
+      "configureProvider": "Konfigurasi penyedia",
+      "languageAuto": "Auto"
    }
  },
  "chat": {
@ -678,6 +697,21 @@
        "deepResearch": "Riset mendalam",
        "voice": "Input suara"
      },
+      "voice": {
+        "hint": "Klik untuk mendikte atau tahan",
+        "stop": "Hentikan rekaman",
+        "transcribing": "Mentranskripsi...",
+        "recordingStatus": "Merekam {{time}}"
+      },
+      "voiceErrors": {
+        "unsupported": "Input suara tidak didukung di browser ini.",
+        "permission": "Izin mikrofon diperlukan.",
+        "notConfigured": "Konfigurasikan penyedia transkripsi terlebih dahulu.",
+        "tooLong": "Rekaman terlalu panjang.",
+        "tooShort": "Tahan sedikit lebih lama untuk merekam suara.",
+        "noInput": "Tidak ada input mikrofon yang terdeteksi.",
+        "failed": "Tidak dapat mentranskripsi audio."
+      },
      "slash": {
        "ariaLabel": "Perintah slash",
        "label": "perintah",
--- a/webui/src/i18n/locales/ja/common.json
+++ b/webui/src/i18n/locales/ja/common.json
@ -73,6 +73,7 @@
      "models": "モデル",
      "providers": "プロバイダー",
      "image": "画像",
+      "voice": "音声",
      "browser": "ウェブ",
      "runtime": "システム",
      "advanced": "セキュリティ",
@ -99,7 +100,8 @@
      "mcp": "MCP サービス",
      "apps": "アプリ",
      "nativeHost": "ネイティブホスト",
-      "hostSafety": "アプリの安全性"
+      "hostSafety": "アプリの安全性",
+      "voiceInput": "音声入力"
    },
    "rows": {
      "theme": "テーマ",
@ -142,7 +144,13 @@
      "engine": "エンジン",
      "logs": "ログ",
      "diagnostics": "診断",
-      "contextWindow": "コンテキストウィンドウ"
+      "contextWindow": "コンテキストウィンドウ",
+      "transcription": "文字起こし",
+      "transcriptionProvider": "プロバイダー",
+      "transcriptionProviderStatus": "プロバイダー状態",
+      "transcriptionModel": "モデル",
+      "transcriptionLanguage": "言語",
+      "voiceLimits": "制限"
    },
    "help": {
      "theme": "ライト表示とダーク表示を切り替えます。",
@ -181,7 +189,12 @@
      "diagnostics": "サポート用の小さなランタイムレポートを書き出します。",
      "localServiceAccessNative": "Full Access の shell コマンドがこの Mac 上のサービスにアクセスできるようにします。",
      "webuiDefaultAccessNative": "プロジェクト固有の権限がないネイティブチャットで使用します。",
-      "contextWindow": "このモデル設定で使う既定のコンテキスト予算を選択します。"
+      "contextWindow": "このモデル設定で使う既定のコンテキスト予算を選択します。",
+      "transcription": "マイク入力を送信前に文字起こしします。チャネルの音声メッセージも同じ設定を使います。",
+      "transcriptionProvider": "プロバイダー設定にある対応する認証情報を使います。",
+      "transcriptionProviderStatus": "APIキーは文字起こし設定ではなくプロバイダー側に保存されます。",
+      "transcriptionModel": "プロバイダーがカスタムモデルIDを必要としない限り、解決済みのデフォルトのままにします。",
+      "transcriptionLanguage": "en、zh、ja、ko などの任意の ISO-639 ヒント。"
    },
    "values": {
      "light": "ライト",
@ -283,6 +296,7 @@
      "totalProviders": "{{count}} 個利用可能",
      "webSearch": "Web 検索",
      "imageGeneration": "画像生成",
+      "voiceInput": "音声入力",
      "workspace": "ワークスペース"
    },
    "usage": {
@ -486,6 +500,11 @@
      "rawInstructions": "元の SKILL.md",
      "rawInstructionsEmpty": "元の説明はありません。",
      "detailDescription": "{{name}} の詳細。"
+    },
+    "voice": {
+      "selectProvider": "プロバイダーを選択",
+      "configureProvider": "プロバイダーを設定",
+      "languageAuto": "自動"
    }
  },
  "chat": {
@ -678,6 +697,21 @@
        "deepResearch": "詳細調査",
        "voice": "音声入力"
      },
+      "voice": {
+        "hint": "クリックして音声入力、または長押し",
+        "stop": "録音を停止",
+        "transcribing": "文字起こし中...",
+        "recordingStatus": "録音中 {{time}}"
+      },
+      "voiceErrors": {
+        "unsupported": "このブラウザーは音声入力に対応していません。",
+        "permission": "マイクの許可が必要です。",
+        "notConfigured": "先に文字起こしプロバイダーを設定してください。",
+        "tooLong": "録音が長すぎます。",
+        "tooShort": "もう少し長く録音してください。",
+        "noInput": "マイク入力が検出されませんでした。",
+        "failed": "音声を文字起こしできませんでした。"
+      },
      "slash": {
        "ariaLabel": "スラッシュコマンド",
        "label": "コマンド",
--- a/webui/src/i18n/locales/ko/common.json
+++ b/webui/src/i18n/locales/ko/common.json
@ -73,6 +73,7 @@
      "models": "모델",
      "providers": "제공자",
      "image": "이미지",
+      "voice": "음성",
      "browser": "웹",
      "runtime": "시스템",
      "advanced": "보안",
@ -99,7 +100,8 @@
      "mcp": "MCP 서비스",
      "apps": "앱",
      "nativeHost": "네이티브 호스트",
-      "hostSafety": "앱 보안"
+      "hostSafety": "앱 보안",
+      "voiceInput": "음성 입력"
    },
    "rows": {
      "theme": "테마",
@ -142,7 +144,13 @@
      "engine": "엔진",
      "logs": "로그",
      "diagnostics": "진단",
-      "contextWindow": "컨텍스트 창"
+      "contextWindow": "컨텍스트 창",
+      "transcription": "전사",
+      "transcriptionProvider": "제공자",
+      "transcriptionProviderStatus": "제공자 상태",
+      "transcriptionModel": "모델",
+      "transcriptionLanguage": "언어",
+      "voiceLimits": "제한"
    },
    "help": {
      "theme": "밝은 모드와 어두운 모드를 전환합니다.",
@ -181,7 +189,12 @@
      "diagnostics": "지원용 작은 런타임 보고서를 내보냅니다.",
      "localServiceAccessNative": "Full Access shell 명령이 이 Mac의 서비스에 접근할 수 있게 합니다.",
      "webuiDefaultAccessNative": "프로젝트별 권한이 없는 네이티브 채팅에 사용됩니다.",
-      "contextWindow": "이 모델 구성의 기본 컨텍스트 예산을 선택합니다."
+      "contextWindow": "이 모델 구성의 기본 컨텍스트 예산을 선택합니다.",
+      "transcription": "마이크 입력을 보내기 전에 텍스트로 변환합니다. 채널 음성 메시지도 같은 설정을 사용합니다.",
+      "transcriptionProvider": "Providers에 저장된 해당 제공자의 인증 정보를 사용합니다.",
+      "transcriptionProviderStatus": "API 키는 transcription 설정이 아니라 providers 아래에 유지됩니다.",
+      "transcriptionModel": "제공자가 사용자 지정 모델 ID를 요구하지 않으면 해석된 기본값을 사용하세요.",
+      "transcriptionLanguage": "en, zh, ja, ko 같은 선택적 ISO-639 힌트입니다."
    },
    "values": {
      "light": "라이트",
@ -283,6 +296,7 @@
      "totalProviders": "{{count}}개 사용 가능",
      "webSearch": "웹 검색",
      "imageGeneration": "이미지 생성",
+      "voiceInput": "음성 입력",
      "workspace": "작업공간"
    },
    "usage": {
@ -486,6 +500,11 @@
      "rawInstructions": "원본 SKILL.md",
      "rawInstructionsEmpty": "원본 지침이 없습니다.",
      "detailDescription": "{{name}} 세부 정보."
+    },
+    "voice": {
+      "selectProvider": "제공자 선택",
+      "configureProvider": "제공자 설정",
+      "languageAuto": "자동"
    }
  },
  "chat": {
@ -678,6 +697,21 @@
        "deepResearch": "심층 조사",
        "voice": "음성 입력"
      },
+      "voice": {
+        "hint": "클릭해 받아쓰거나 길게 누르기",
+        "stop": "녹음 중지",
+        "transcribing": "변환 중...",
+        "recordingStatus": "녹음 중 {{time}}"
+      },
+      "voiceErrors": {
+        "unsupported": "이 브라우저는 음성 입력을 지원하지 않습니다.",
+        "permission": "마이크 권한이 필요합니다.",
+        "notConfigured": "먼저 음성 변환 제공자를 설정하세요.",
+        "tooLong": "녹음 시간이 너무 깁니다.",
+        "tooShort": "음성을 녹음하려면 조금 더 길게 눌러 주세요.",
+        "noInput": "마이크 입력이 감지되지 않았습니다.",
+        "failed": "오디오를 변환하지 못했습니다."
+      },
      "slash": {
        "ariaLabel": "슬래시 명령",
        "label": "명령",
--- a/webui/src/i18n/locales/vi/common.json
+++ b/webui/src/i18n/locales/vi/common.json
@ -73,6 +73,7 @@
      "models": "Mô hình",
      "providers": "Nhà cung cấp",
      "image": "Hình ảnh",
+      "voice": "Giọng nói",
      "browser": "Trang web",
      "runtime": "Hệ thống",
      "advanced": "Bảo mật",
@ -99,7 +100,8 @@
      "mcp": "Dịch vụ MCP",
      "apps": "Ứng dụng",
      "nativeHost": "Host gốc",
-      "hostSafety": "An toàn ứng dụng"
+      "hostSafety": "An toàn ứng dụng",
+      "voiceInput": "Nhập bằng giọng nói"
    },
    "rows": {
      "theme": "Chủ đề",
@ -142,7 +144,13 @@
      "engine": "Bộ máy",
      "logs": "Nhật ký",
      "diagnostics": "Chẩn đoán",
-      "contextWindow": "Cửa sổ ngữ cảnh"
+      "contextWindow": "Cửa sổ ngữ cảnh",
+      "transcription": "Phiên âm",
+      "transcriptionProvider": "Nhà cung cấp",
+      "transcriptionProviderStatus": "Trạng thái nhà cung cấp",
+      "transcriptionModel": "Mô hình",
+      "transcriptionLanguage": "Ngôn ngữ",
+      "voiceLimits": "Giới hạn"
    },
    "help": {
      "theme": "Chuyển giữa giao diện sáng và tối.",
@ -181,7 +189,12 @@
      "diagnostics": "Exporta un pequeño informe de runtime para soporte.",
      "localServiceAccessNative": "Permite que comandos shell con Full Access alcancen servicios en este Mac.",
      "webuiDefaultAccessNative": "Usado por chats nativos sin permiso específico de proyecto.",
-      "contextWindow": "Chọn ngân sách ngữ cảnh mặc định cho cấu hình mô hình này."
+      "contextWindow": "Chọn ngân sách ngữ cảnh mặc định cho cấu hình mô hình này.",
+      "transcription": "Phiên âm đầu vào micrô trước khi gửi. Tin nhắn giọng nói từ kênh chat dùng cùng cài đặt.",
+      "transcriptionProvider": "Dùng thông tin xác thực của nhà cung cấp từ Providers.",
+      "transcriptionProviderStatus": "API key nằm trong providers, không nằm trong cài đặt transcription.",
+      "transcriptionModel": "Giữ mặc định đã resolve trừ khi nhà cung cấp cần id model tùy chỉnh.",
+      "transcriptionLanguage": "Gợi ý ISO-639 tùy chọn, như en, zh, ja hoặc ko."
    },
    "values": {
      "light": "Sáng",
@ -283,6 +296,7 @@
      "totalProviders": "{{count}} khả dụng",
      "webSearch": "Tìm kiếm web",
      "imageGeneration": "Tạo hình ảnh",
+      "voiceInput": "Nhập bằng giọng nói",
      "workspace": "Không gian làm việc"
    },
    "usage": {
@ -486,6 +500,11 @@
      "rawInstructions": "SKILL.md gốc",
      "rawInstructionsEmpty": "Không có hướng dẫn gốc.",
      "detailDescription": "Chi tiết cho {{name}}."
+    },
+    "voice": {
+      "selectProvider": "Chọn nhà cung cấp",
+      "configureProvider": "Cấu hình nhà cung cấp",
+      "languageAuto": "Tự động"
    }
  },
  "chat": {
@ -678,6 +697,21 @@
        "deepResearch": "Nghiên cứu sâu",
        "voice": "Nhập bằng giọng nói"
      },
+      "voice": {
+        "hint": "Bấm để đọc chính tả hoặc nhấn giữ",
+        "stop": "Dừng ghi âm",
+        "transcribing": "Đang chép lời...",
+        "recordingStatus": "Đang ghi {{time}}"
+      },
+      "voiceErrors": {
+        "unsupported": "Trình duyệt này không hỗ trợ nhập bằng giọng nói.",
+        "permission": "Cần quyền truy cập micrô.",
+        "notConfigured": "Hãy cấu hình nhà cung cấp chép lời trước.",
+        "tooLong": "Bản ghi âm quá dài.",
+        "tooShort": "Giữ lâu hơn một chút để ghi âm giọng nói.",
+        "noInput": "Không phát hiện đầu vào micrô.",
+        "failed": "Không thể chép lời âm thanh."
+      },
      "slash": {
        "ariaLabel": "Lệnh slash",
        "label": "lệnh",
--- a/webui/src/i18n/locales/zh-CN/common.json
+++ b/webui/src/i18n/locales/zh-CN/common.json
@ -73,6 +73,7 @@
      "models": "模型",
      "providers": "提供商",
      "image": "图片",
+      "voice": "语音",
      "browser": "网页",
      "cliApps": "CLI 应用",
      "mcp": "MCP",
@ -99,7 +100,8 @@
      "capabilities": "能力",
      "apps": "应用",
      "nativeHost": "原生宿主",
-      "hostSafety": "应用安全"
+      "hostSafety": "应用安全",
+      "voiceInput": "语音识别"
    },
    "models": {
      "selectModel": "选择模型",
@ -161,7 +163,13 @@
      "engine": "引擎",
      "logs": "日志",
      "diagnostics": "诊断",
-      "contextWindow": "上下文窗口"
+      "contextWindow": "上下文窗口",
+      "transcription": "语音转写",
+      "transcriptionProvider": "提供商",
+      "transcriptionProviderStatus": "提供商状态",
+      "transcriptionModel": "模型",
+      "transcriptionLanguage": "语言",
+      "voiceLimits": "限制"
    },
    "help": {
      "theme": "在浅色和深色外观之间切换。",
@ -200,7 +208,12 @@
      "diagnostics": "导出一份用于支持排查的小型运行报告。",
      "localServiceAccessNative": "允许完全访问权限下的 shell 命令访问这台 Mac 上的服务。",
      "webuiDefaultAccessNative": "用于没有单独项目权限的原生聊天。",
-      "contextWindow": "选择此模型配置的默认上下文预算。"
+      "contextWindow": "选择此模型配置的默认上下文预算。",
+      "transcription": "发送前先把麦克风输入转写到输入框。聊天渠道里的语音消息也使用同一套设置。",
+      "transcriptionProvider": "使用「提供商」中对应提供商的凭据。",
+      "transcriptionProviderStatus": "API Key 仍保存在 providers 里，不写进 transcription 设置。",
+      "transcriptionModel": "除非提供商需要自定义模型 ID，否则保持解析后的默认值即可。",
+      "transcriptionLanguage": "可选 ISO-639 语言提示，例如 en、zh、ja 或 ko。"
    },
    "timezone": {
      "select": "选择时区",
@ -391,6 +404,7 @@
      "totalProviders": "共 {{count}} 个可用",
      "webSearch": "网页搜索",
      "imageGeneration": "图片生成",
+      "voiceInput": "语音识别",
      "workspace": "工作区"
    },
    "usage": {
@ -486,6 +500,11 @@
      "rawInstructions": "原始 SKILL.md",
      "rawInstructionsEmpty": "没有原始说明。",
      "detailDescription": "{{name}} 的详情。"
+    },
+    "voice": {
+      "selectProvider": "选择提供商",
+      "configureProvider": "配置提供商",
+      "languageAuto": "自动"
    }
  },
  "chat": {
@ -677,6 +696,21 @@
        "deepResearch": "深度研究",
        "voice": "语音输入"
      },
+      "voice": {
+        "hint": "点击进行听写或长按",
+        "stop": "停止录音",
+        "transcribing": "正在转写...",
+        "recordingStatus": "正在录音 {{time}}"
+      },
+      "voiceErrors": {
+        "unsupported": "当前浏览器不支持语音输入。",
+        "permission": "需要麦克风权限。",
+        "notConfigured": "请先配置转写提供商。",
+        "tooLong": "录音时间太长。",
+        "tooShort": "请稍微多录一会儿。",
+        "noInput": "没有检测到麦克风输入。",
+        "failed": "语音转写失败。"
+      },
      "slash": {
        "ariaLabel": "斜杠命令",
        "label": "命令",
--- a/webui/src/i18n/locales/zh-TW/common.json
+++ b/webui/src/i18n/locales/zh-TW/common.json
@ -73,6 +73,7 @@
      "models": "模型",
      "providers": "提供商",
      "image": "圖片",
+      "voice": "語音",
      "browser": "網頁",
      "runtime": "系統",
      "advanced": "安全",
@ -99,7 +100,8 @@
      "mcp": "MCP 服務",
      "apps": "應用",
      "nativeHost": "原生宿主",
-      "hostSafety": "App 安全"
+      "hostSafety": "App 安全",
+      "voiceInput": "語音辨識"
    },
    "rows": {
      "theme": "主題",
@ -142,7 +144,13 @@
      "engine": "引擎",
      "logs": "日誌",
      "diagnostics": "診斷",
-      "contextWindow": "上下文視窗"
+      "contextWindow": "上下文視窗",
+      "transcription": "語音轉寫",
+      "transcriptionProvider": "提供商",
+      "transcriptionProviderStatus": "提供商狀態",
+      "transcriptionModel": "模型",
+      "transcriptionLanguage": "語言",
+      "voiceLimits": "限制"
    },
    "help": {
      "theme": "在淺色與深色外觀之間切換。",
@ -181,7 +189,12 @@
      "diagnostics": "匯出一份用於支援排查的小型執行報告。",
      "localServiceAccessNative": "允許完全訪問權限下的 shell 命令訪問這台 Mac 上的服務。",
      "webuiDefaultAccessNative": "用於沒有單獨專案權限的原生聊天。",
-      "contextWindow": "選擇此模型配置的預設上下文預算。"
+      "contextWindow": "選擇此模型配置的預設上下文預算。",
+      "transcription": "送出前先把麥克風輸入轉寫到輸入框。聊天渠道的語音訊息也使用同一組設定。",
+      "transcriptionProvider": "使用「提供商」中對應提供商的憑證。",
+      "transcriptionProviderStatus": "API Key 仍保存在 providers 裡，不寫進 transcription 設定。",
+      "transcriptionModel": "除非提供商需要自訂模型 ID，否則保持解析後的預設值即可。",
+      "transcriptionLanguage": "可選 ISO-639 語言提示，例如 en、zh、ja 或 ko。"
    },
    "values": {
      "light": "淺色",
@ -283,6 +296,7 @@
      "totalProviders": "共 {{count}} 個可用",
      "webSearch": "網頁搜尋",
      "imageGeneration": "圖片生成",
+      "voiceInput": "語音辨識",
      "workspace": "工作區"
    },
    "usage": {
@ -486,6 +500,11 @@
      "rawInstructions": "原始 SKILL.md",
      "rawInstructionsEmpty": "沒有原始說明。",
      "detailDescription": "{{name}} 的詳細資訊。"
+    },
+    "voice": {
+      "selectProvider": "選擇提供商",
+      "configureProvider": "設定提供商",
+      "languageAuto": "自動"
    }
  },
  "chat": {
@ -678,6 +697,21 @@
        "deepResearch": "深度研究",
        "voice": "語音輸入"
      },
+      "voice": {
+        "hint": "點擊進行聽寫或長按",
+        "stop": "停止錄音",
+        "transcribing": "正在轉寫...",
+        "recordingStatus": "正在錄音 {{time}}"
+      },
+      "voiceErrors": {
+        "unsupported": "目前瀏覽器不支援語音輸入。",
+        "permission": "需要麥克風權限。",
+        "notConfigured": "請先設定轉寫提供商。",
+        "tooLong": "錄音時間太長。",
+        "tooShort": "請稍微多錄一會兒。",
+        "noInput": "沒有偵測到麥克風輸入。",
+        "failed": "語音轉寫失敗。"
+      },
      "slash": {
        "ariaLabel": "斜線命令",
        "label": "命令",
--- a/webui/src/lib/ansi.ts
+++ b/webui/src/lib/ansi.ts
@ -0,0 +1,210 @@
+/**
+ * One run of plain text plus the style that was active when it was emitted.
+ * `style` is absent when no styling applies to the run.
+ */
+export type AnsiSegment = {
+  text: string;
+  style?: AnsiStyle;
+};
+
+/**
+ * Presentation properties derived from the active SGR state for a segment.
+ * Every field is optional; only attributes that are currently switched on
+ * are present.
+ */
+export type AnsiStyle = {
+  backgroundColor?: string;
+  color?: string;
+  fontStyle?: "italic";
+  fontWeight?: number;
+  opacity?: number;
+  textDecorationLine?: "underline";
+};
+
+/**
+ * Mutable record of the SGR attributes in effect while a string is parsed.
+ * Colours are optional (absent means "no colour set"); the flags are always
+ * present.
+ */
+type AnsiState = {
+  backgroundColor?: string;
+  bold: boolean;
+  color?: string;
+  dim: boolean;
+  // When true, foreground and background are swapped at render time.
+  inverse: boolean;
+  italic: boolean;
+  underline: boolean;
+};
+
+// The escape character (code 27) that introduces every sequence matched below.
+const ESC = String.fromCharCode(27);
+// Matches ESC "[" followed by bytes in 0-?, then bytes in space-/, then one
+// final byte in @-~. The regex is global, so `lastIndex` is stateful: callers
+// in this module reset it to 0 before each use.
+const ANSI_PATTERN = new RegExp(`${ESC}\\[[0-?]*[ -/]*[@-~]`, "g");
+
+// The eight standard colours. Indexed by (code - 30) for foreground,
+// (code - 40) for background, and by 256-colour indexes 0-7.
+const ANSI_COLORS = [
+  "#000000",
+  "#cd3131",
+  "#0dbc79",
+  "#e5e510",
+  "#2472c8",
+  "#bc3fbc",
+  "#11a8cd",
+  "#e5e5e5",
+];
+
+// The eight bright colours. Indexed by (code - 90) for foreground,
+// (code - 100) for background, and by 256-colour indexes 8-15 (index - 8).
+const ANSI_BRIGHT_COLORS = [
+  "#666666",
+  "#f14c4c",
+  "#23d18b",
+  "#f5f543",
+  "#3b8eea",
+  "#d670d6",
+  "#29b8db",
+  "#ffffff",
+];
+
+// Channel intensities for the six levels of each axis of the colour cube
+// addressed by 256-colour indexes 16-231.
+const RGB_STEPS = [0, 95, 135, 175, 215, 255];
+
+/**
+ * Reports whether `value` contains at least one escape sequence matched by
+ * the module's ANSI pattern.
+ */
+export function hasAnsi(value: string): boolean {
+  // The shared pattern is global; rewind it so the search starts at offset 0.
+  ANSI_PATTERN.lastIndex = 0;
+  const found = ANSI_PATTERN.test(value);
+  return found;
+}
+
+/**
+ * Returns `value` with every escape sequence matched by the module's ANSI
+ * pattern removed; all other text is left unchanged.
+ */
+export function stripAnsi(value: string): string {
+  ANSI_PATTERN.lastIndex = 0;
+  // Splitting on the pattern and re-joining drops exactly the matched spans.
+  const visibleParts = value.split(ANSI_PATTERN);
+  return visibleParts.join("");
+}
+
+/**
+ * Builds a fresh state with every attribute flag off. The colour fields are
+ * omitted rather than set, so the returned object has exactly five keys.
+ */
+function initialState(): AnsiState {
+  const cleared: AnsiState = {
+    bold: false,
+    dim: false,
+    inverse: false,
+    italic: false,
+    underline: false,
+  };
+  return cleared;
+}
+
+/**
+ * Converts a 256-colour palette index into a CSS colour string.
+ *
+ * - 0-7 map to the standard colours, 8-15 to the bright colours.
+ * - 16-231 address a 6x6x6 colour cube built from `RGB_STEPS`.
+ * - 232-255 form a grey ramp starting at 8 and rising by 10 per step.
+ *
+ * @param value Palette index taken from an SGR parameter list.
+ * @returns The colour, or `undefined` when `value` is not an integer in 0-255.
+ */
+function colorFrom256(value: number): string | undefined {
+  // A truncated sequence hands this function a missing parameter (undefined/NaN).
+  // NaN passes plain `<`/`>` comparisons, so reject non-integers explicitly
+  // rather than emitting "rgb(NaN, NaN, NaN)".
+  if (!Number.isInteger(value) || value < 0 || value > 255) return undefined;
+  if (value < 8) return ANSI_COLORS[value];
+  if (value < 16) return ANSI_BRIGHT_COLORS[value - 8];
+  if (value < 232) {
+    const offset = value - 16;
+    const red = RGB_STEPS[Math.floor(offset / 36)];
+    const green = RGB_STEPS[Math.floor((offset % 36) / 6)];
+    const blue = RGB_STEPS[offset % 6];
+    return `rgb(${red}, ${green}, ${blue})`;
+  }
+  const gray = 8 + ((value - 232) * 10);
+  return `rgb(${gray}, ${gray}, ${gray})`;
+}
+
+/**
+ * Formats three colour channels as a CSS `rgb(...)` string.
+ *
+ * @returns The colour, or `undefined` if any channel is non-finite or lies
+ *   outside 0-255.
+ */
+function colorFromRgb(red: number, green: number, blue: number): string | undefined {
+  for (const channel of [red, green, blue]) {
+    const usable = Number.isFinite(channel) && channel >= 0 && channel <= 255;
+    if (!usable) return undefined;
+  }
+  return `rgb(${red}, ${green}, ${blue})`;
+}
+
+/**
+ * Extracts the numeric parameters of an SGR ("...m") escape sequence.
+ *
+ * @param sequence A full escape sequence as matched by the ANSI pattern
+ *   (ESC, "[", parameters, final byte).
+ * @returns The parameter list (`[0]` for an empty body, with empty fields
+ *   read as 0), or `null` when the sequence is not an SGR sequence.
+ */
+function normalizedSgrParams(sequence: string): number[] | null {
+  if (!sequence.endsWith("m")) return null;
+  // Drop the two-character introducer and the final "m".
+  const body = sequence.slice(2, -1).trim();
+  if (!body) return [0];
+  // Sequences with a private marker (<, =, >, ?) or other non-numeric bytes,
+  // such as "ESC[>4;2m", end in "m" but are not SGR. Treating them as SGR
+  // would turn the unparsable field into 0 and reset all styling.
+  if (!/^[0-9;:]+$/.test(body)) return null;
+  return body.split(/[;:]/).map((part) => {
+    const value = Number.parseInt(part || "0", 10);
+    // Absurdly long digit runs can parse to Infinity; fold those to 0.
+    return Number.isFinite(value) ? value : 0;
+  });
+}
+
+/**
+ * Handles the parameters that follow an extended-colour code (38 or 48).
+ *
+ * Mode 5 reads one palette index; mode 2 reads three channel values. The
+ * colour is written to `state[key]` only when it is valid.
+ *
+ * @param index Position of the 38/48 code inside `params`.
+ * @returns The index of the last parameter consumed, so the caller's loop
+ *   resumes after it; `index` itself when the mode is unrecognised.
+ */
+function applyExtendedColor(
+  state: AnsiState,
+  params: number[],
+  index: number,
+  key: "color" | "backgroundColor",
+): number {
+  switch (params[index + 1]) {
+    case 5: {
+      const indexed = colorFrom256(params[index + 2]);
+      if (indexed) state[key] = indexed;
+      return index + 2;
+    }
+    case 2: {
+      const [red, green, blue] = params.slice(index + 2, index + 5);
+      const direct = colorFromRgb(red, green, blue);
+      if (direct) state[key] = direct;
+      return index + 4;
+    }
+    default:
+      return index;
+  }
+}
+
+/**
+ * Applies a list of SGR parameters to `state`, mutating it in place.
+ *
+ * Unrecognised codes are ignored. Codes 38 and 48 consume the parameters
+ * that follow them via `applyExtendedColor`.
+ *
+ * @param state Attribute state to update.
+ * @param params Numeric SGR parameters in the order they appeared.
+ */
+function applySgrParams(state: AnsiState, params: number[]): void {
+  for (let index = 0; index < params.length; index += 1) {
+    const code = params[index];
+    if (code === 0) {
+      Object.assign(state, initialState());
+      // initialState() has no colour keys and Object.assign never removes
+      // target properties, so colours must be cleared explicitly on reset.
+      delete state.color;
+      delete state.backgroundColor;
+    } else if (code === 1) {
+      // Bold and dim are treated as mutually exclusive intensities.
+      state.bold = true;
+      state.dim = false;
+    } else if (code === 2) {
+      state.dim = true;
+      state.bold = false;
+    } else if (code === 3) {
+      state.italic = true;
+    } else if (code === 4) {
+      state.underline = true;
+    } else if (code === 7) {
+      state.inverse = true;
+    } else if (code === 22) {
+      // Normal intensity: clears both bold and dim.
+      state.bold = false;
+      state.dim = false;
+    } else if (code === 23) {
+      state.italic = false;
+    } else if (code === 24) {
+      state.underline = false;
+    } else if (code === 27) {
+      state.inverse = false;
+    } else if (code === 39) {
+      delete state.color;
+    } else if (code === 49) {
+      delete state.backgroundColor;
+    } else if (code >= 30 && code <= 37) {
+      state.color = ANSI_COLORS[code - 30];
+    } else if (code >= 40 && code <= 47) {
+      state.backgroundColor = ANSI_COLORS[code - 40];
+    } else if (code >= 90 && code <= 97) {
+      state.color = ANSI_BRIGHT_COLORS[code - 90];
+    } else if (code >= 100 && code <= 107) {
+      state.backgroundColor = ANSI_BRIGHT_COLORS[code - 100];
+    } else if (code === 38) {
+      // The helper returns the last consumed index; the loop increment then
+      // moves past it.
+      index = applyExtendedColor(state, params, index, "color");
+    } else if (code === 48) {
+      index = applyExtendedColor(state, params, index, "backgroundColor");
+    }
+  }
+}
+
+/**
+ * Converts the current attribute state into a style object.
+ *
+ * With `inverse` set, the foreground and background colours are swapped.
+ * Bold maps to weight 700, dim to opacity 0.72.
+ *
+ * @returns The style, or `undefined` when no attribute produces a property.
+ */
+function styleFromState(state: AnsiState): AnsiStyle | undefined {
+  const [foreground, background] = state.inverse
+    ? [state.backgroundColor, state.color]
+    : [state.color, state.backgroundColor];
+  const style: AnsiStyle = {
+    ...(foreground ? { color: foreground } : {}),
+    ...(background ? { backgroundColor: background } : {}),
+    ...(state.bold ? { fontWeight: 700 } : {}),
+    ...(state.dim ? { opacity: 0.72 } : {}),
+    ...(state.italic ? { fontStyle: "italic" as const } : {}),
+    ...(state.underline ? { textDecorationLine: "underline" as const } : {}),
+  };
+  return Object.keys(style).length > 0 ? style : undefined;
+}
+
+/**
+ * Splits `value` into styled text segments.
+ *
+ * Escape sequences are removed from the output. SGR sequences update the
+ * running style; every other matched sequence is discarded without effect.
+ * Empty text runs never produce a segment.
+ */
+export function parseAnsiSegments(value: string): AnsiSegment[] {
+  const state = initialState();
+  const collected: AnsiSegment[] = [];
+  // Records a run of text under whatever style is active right now.
+  const emit = (text: string): void => {
+    if (text.length > 0) {
+      collected.push({ text, style: styleFromState(state) });
+    }
+  };
+
+  ANSI_PATTERN.lastIndex = 0;
+  let consumed = 0;
+  for (const escape of value.matchAll(ANSI_PATTERN)) {
+    const start = escape.index ?? 0;
+    // Text before this escape still belongs to the previous style.
+    emit(value.slice(consumed, start));
+    const params = normalizedSgrParams(escape[0]);
+    if (params) applySgrParams(state, params);
+    consumed = start + escape[0].length;
+  }
+  emit(value.slice(consumed));
+
+  return collected;
+}
--- a/webui/src/lib/api.ts
+++ b/webui/src/lib/api.ts
@ -16,6 +16,7 @@ import type {
  SkillDetail,
  SkillsPayload,
  SlashCommand,
+  TranscriptionSettingsUpdate,
  WebSearchSettingsUpdate,
  WorkspacesPayload,
  WebuiThreadPersistedPayload,
@ -547,3 +548,21 @@ export async function updateImageGenerationSettings(
    token,
  );
 }
+
+/** Sends the transcription settings to the update endpoint as query-string parameters. */
+export async function updateTranscriptionSettings(
+  token: string,
+  update: TranscriptionSettingsUpdate,
+  base: string = "",
+): Promise<SettingsPayload> {
+  const params = new URLSearchParams({
+    enabled: String(update.enabled),
+    provider: update.provider,
+    model: update.model,
+    language: update.language,
+    max_duration_sec: String(update.maxDurationSec),
+    max_upload_mb: String(update.maxUploadMb),
+  });
+  const endpoint = `${base}/api/settings/transcription/update?${params.toString()}`;
+  return request<SettingsPayload>(endpoint, token);
+}
--- a/webui/src/lib/nanobot-client.ts
+++ b/webui/src/lib/nanobot-client.ts
@ -95,6 +95,12 @@ interface PendingNewChat {
  timer: ReturnType<typeof setTimeout>;
 }

+interface PendingTranscription { // Bookkeeping for one in-flight transcribe_audio request.
+  resolve: (text: string) => void; // settles the caller's promise with the transcript text
+  reject: (err: Error) => void; // fails the caller's promise
+  timer: ReturnType<typeof setTimeout>; // timeout handle, cleared when the request settles
+}
+
 export interface NanobotClientOptions {
  url: string;
  reconnect?: boolean;
@ -132,6 +138,7 @@ export class NanobotClient {
  /** Latest ``goal_state`` snapshot per ``chat_id`` (multi-session isolation). */
  private goalStateByChatId = new Map<string, GoalStateWsPayload>();
  private pendingNewChat: PendingNewChat | null = null;
+  private pendingTranscriptions = new Map<string, PendingTranscription>();
  // Frames queued while the socket is not yet OPEN
  private sendQueue: Outbound[] = [];
  private reconnectAttempts = 0;
@ -320,6 +327,27 @@ export class NanobotClient {
    });
  }

+  /** Queues a `transcribe_audio` frame; settles on the matching result, an error, or the timeout. */
+  transcribeAudio(
+    dataUrl: string,
+    options?: { durationMs?: number; timeoutMs?: number },
+  ): Promise<string> {
+    const requestId = crypto.randomUUID();
+    const durationMs = options?.durationMs;
+    const extras = durationMs === undefined ? {} : { duration_ms: durationMs };
+    return new Promise<string>((resolve, reject) => {
+      const onTimeout = (): void => {
+        this.pendingTranscriptions.delete(requestId);
+        reject(new Error("transcription timed out"));
+      };
+      // Without a reply for this request id, give up after timeoutMs (default 120 s).
+      const timer = setTimeout(onTimeout, options?.timeoutMs ?? 120_000);
+      this.pendingTranscriptions.set(requestId, { resolve, reject, timer });
+      const frame = { type: "transcribe_audio", request_id: requestId, data_url: dataUrl } as const;
+      this.queueSend({ ...frame, ...extras });
+    });
+  }
+
  attach(chatId: string): void {
    this.knownChats.add(chatId);
    if (this.socket?.readyState === WS_OPEN) {
@ -425,6 +453,16 @@ export class NanobotClient {
      return;
    }

+    if (parsed.event === "transcription_result") {
+      this.resolveTranscription(parsed.request_id, parsed.text);
+      return;
+    }
+
+    if (parsed.event === "transcription_error") {
+      this.rejectTranscription(parsed.request_id, parsed.detail || "error");
+      return;
+    }
+
    if (parsed.event === "session_updated") {
      this.emitSessionUpdate(parsed.chat_id, parsed.scope, parsed.workspace_scope);
      return;
@ -500,6 +538,7 @@ export class NanobotClient {
      this.pendingNewChat.reject(new Error("socket closed"));
      this.pendingNewChat = null;
    }
+    this.rejectAllTranscriptions("socket closed");
    // Surface structured reasons *before* reconnect logic so the UI can
    // display the error even while the client transparently reconnects.
    // Browsers populate ``CloseEvent.code`` with the wire-level close code;
@ -528,6 +567,34 @@ export class NanobotClient {
    }
  }

+  private resolveTranscription(requestId: string, text: string): void {
+    const entry = this.pendingTranscriptions.get(requestId);
+    if (entry === undefined) return; // no request is waiting under this id: nothing to do
+    this.pendingTranscriptions.delete(requestId);
+    clearTimeout(entry.timer); // the timeout must not fire after the promise settles
+    entry.resolve(text);
+  }
+
+  /** Rejects one pending request, or all of them when the error carries no request id. */
+  private rejectTranscription(requestId: string | undefined, detail: string): void {
+    // An error without an id cannot be attributed to a request, so fail every waiter.
+    if (!requestId) return this.rejectAllTranscriptions(detail);
+
+    const entry = this.pendingTranscriptions.get(requestId);
+    if (entry === undefined) return;
+    this.pendingTranscriptions.delete(requestId);
+    clearTimeout(entry.timer);
+    entry.reject(new Error(detail));
+  }
+
+  /** Fails every pending transcription with `detail`, leaving the pending map empty. */
+  private rejectAllTranscriptions(detail: string): void {
+    const waiting = Array.from(this.pendingTranscriptions.values());
+    this.pendingTranscriptions.clear();
+    waiting.forEach(({ timer }) => clearTimeout(timer));
+    waiting.forEach(({ reject }) => reject(new Error(detail)));
+  }
+
  private scheduleReconnect(): void {
    this.setStatus("reconnecting");
    const attempt = this.reconnectAttempts++;
--- a/webui/src/lib/types.ts
+++ b/webui/src/lib/types.ts
@ -391,6 +391,23 @@ export interface SettingsPayload {
      default_api_base?: string | null;
    }>;
  };
+  transcription?: {
+    enabled: boolean;
+    provider: string;
+    provider_configured: boolean;
+    model: string;
+    language: string | null;
+    max_duration_sec: number;
+    max_upload_mb: number;
+    providers: Array<{
+      name: string;
+      label: string;
+      configured: boolean;
+      api_key_hint?: string | null;
+      api_base?: string | null;
+      default_api_base?: string | null;
+    }>;
+  };
  runtime: {
    config_path: string;
    workspace_path: string;
@ -680,6 +697,15 @@ export interface ImageGenerationSettingsUpdate {
  maxImagesPerTurn: number;
 }

+export interface TranscriptionSettingsUpdate { // Argument of updateTranscriptionSettings; each field becomes a query parameter.
+  enabled: boolean;
+  provider: string;
+  model: string;
+  language: string;
+  maxDurationSec: number; // sent as max_duration_sec
+  maxUploadMb: number; // sent as max_upload_mb
+}
+
 export interface SlashCommand {
  command: string;
  title: string;
@ -782,6 +808,13 @@ export type InboundEvent =
      scope?: "metadata" | "thread" | string;
      workspace_scope?: WorkspaceScopePayload;
    }
+  | { event: "transcription_result"; request_id: string; text: string }
+  | {
+      event: "transcription_error";
+      request_id?: string;
+      detail?: string;
+      provider?: string;
+    }
  | { event: "error"; chat_id?: string; detail?: string; reason?: string };

 /** Base64-encoded image attached to an outbound ``message`` envelope.
@ -845,6 +878,7 @@ export type Outbound =
  | { type: "new_chat"; workspace_scope?: WorkspaceScopePayload }
  | { type: "attach"; chat_id: string }
  | { type: "set_workspace_scope"; chat_id: string; workspace_scope: WorkspaceScopePayload }
+  | { type: "transcribe_audio"; request_id: string; data_url: string; duration_ms?: number }
  | {
      type: "message";
      chat_id: string;
--- a/webui/src/tests/app-layout.test.tsx
+++ b/webui/src/tests/app-layout.test.tsx
@ -1172,13 +1172,13 @@ describe("App layout", () => {

  it("restores the settings section from the URL hash after a page reload", async () => {
    mockFetchRoutes({ "/api/settings": baseSettingsPayload() });
-    window.history.replaceState(null, "", "/#/settings?section=models");
+    window.history.replaceState(null, "", "/#/settings?section=voice");

    render(<App />);

    await waitFor(() => expect(connectSpy).toHaveBeenCalled());
-    expect(await screen.findByRole("heading", { name: "Models" })).toBeInTheDocument();
-    expect(window.location.hash).toBe("#/settings?section=models");
+    expect(await screen.findByRole("heading", { name: "Voice input" })).toBeInTheDocument();
+    expect(window.location.hash).toBe("#/settings?section=voice");
  });

  it("updates the URL hash when switching settings sections", async () => {
@ -1197,6 +1197,11 @@ describe("App layout", () => {

    expect(await screen.findByRole("heading", { name: "Models" })).toBeInTheDocument();
    expect(window.location.hash).toBe("#/settings?section=models");
+
+    fireEvent.click(within(settingsNav).getByRole("button", { name: "Voice" }));
+
+    expect(await screen.findByRole("heading", { name: "Voice input" })).toBeInTheDocument();
+    expect(window.location.hash).toBe("#/settings?section=voice");
  });

  it("opens Apps from the main sidebar without replacing the sidebar", async () => {
--- a/webui/src/tests/code-block.test.tsx
+++ b/webui/src/tests/code-block.test.tsx
@ -1,4 +1,5 @@
 import { act, render, screen } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
 import { describe, expect, it, vi } from "vitest";

 import { CodeBlock } from "@/components/CodeBlock";
@ -87,6 +88,64 @@ describe("CodeBlock", () => {
    expect(screen.getByText("const value = 1;")).toBeInTheDocument();
  });

+  it("renders ANSI output without mounting the syntax highlighter", () => { // also checks that markup in the output stays inert
+    render(
+      <ThemeProvider theme="dark">
+        <CodeBlock
+          language="ansi"
+          code={"\x1b[32mPASS\x1b[0m <script>alert(1)</script>"}
+        />
+      </ThemeProvider>,
+    );
+
+    expect(screen.queryByTestId("highlighted-code")).not.toBeInTheDocument(); // highlighter is bypassed
+    expect(screen.getByTestId("ansi-code")).toBeInTheDocument();
+    expect(screen.getByTestId("ansi-code").closest(".not-prose")).toBeTruthy();
+    expect(screen.getByText("ansi")).toBeInTheDocument();
+    expect(screen.getByText("PASS")).toHaveStyle({ color: "#0dbc79" }); // colour expected for SGR code 32
+    expect(screen.getByText("<script>alert(1)</script>")).toBeInTheDocument(); // shown as literal text
+    expect(document.querySelector("script")).toBeNull(); // and never injected as an element
+  });
+
+  it("detects ANSI sequences in regular code blocks", () => { // language is "text", so detection is content-based
+    render(
+      <ThemeProvider theme="light">
+        <CodeBlock
+          language="text"
+          code={"\x1b[38;2;35;209;139mtruecolor\x1b[0m"}
+        />
+      </ThemeProvider>,
+    );
+
+    expect(screen.queryByTestId("highlighted-code")).not.toBeInTheDocument();
+    expect(screen.getByText("truecolor")).toHaveStyle({ // 38;2;r;g;b carries the RGB triple asserted below
+      color: "rgb(35, 209, 139)",
+    });
+  });
+
+  it("copies ANSI output as clean text", async () => {
+    const user = userEvent.setup();
+    const writeText = vi.fn().mockResolvedValue(undefined);
+    Object.defineProperty(navigator, "clipboard", { // stub clipboard so the written text can be inspected
+      configurable: true,
+      value: { writeText },
+    });
+
+    try {
+      render(
+        <ThemeProvider theme="dark">
+          <CodeBlock language="ansi" code={"\x1b[32mPASS\x1b[0m"} />
+        </ThemeProvider>,
+      );
+
+      await user.click(screen.getByRole("button", { name: /copy/i }));
+
+      expect(writeText).toHaveBeenCalledWith("PASS"); // escape sequences are stripped from the copied text
+    } finally {
+      Reflect.deleteProperty(navigator, "clipboard"); // remove the stub even if an assertion failed
+    }
+  });
+
  it("reads theme from context without creating per-block observers", async () => {
    const originalMutationObserver = globalThis.MutationObserver;
    const observer = vi.fn();
--- a/webui/src/tests/nanobot-client.test.ts
+++ b/webui/src/tests/nanobot-client.test.ts
@ -412,6 +412,61 @@ describe("NanobotClient", () => {
    );
  });

+  it("sends transcription requests and resolves transcription results outside chat dispatch", async () => {
+    const client = new NanobotClient({
+      url: "ws://test",
+      reconnect: false,
+      socketFactory: (url) => new FakeSocket(url) as unknown as WebSocket,
+    });
+    const handler = vi.fn(); // chat subscriber that must never see transcription traffic
+    client.onChat("chat-a", handler);
+    client.connect();
+    lastSocket().fakeOpen();
+
+    const promise = client.transcribeAudio("data:audio/webm;base64,AAAA", {
+      durationMs: 1234,
+      timeoutMs: 1_000,
+    });
+    const frame = JSON.parse(lastSocket().sent.at(-1) as string); // the frame the client just wrote
+    expect(frame).toMatchObject({
+      type: "transcribe_audio",
+      data_url: "data:audio/webm;base64,AAAA",
+      duration_ms: 1234,
+    });
+    expect(typeof frame.request_id).toBe("string");
+
+    lastSocket().fakeMessage({ // reply correlated by the generated request id
+      event: "transcription_result",
+      request_id: frame.request_id,
+      text: "hello from voice",
+    });
+    await expect(promise).resolves.toBe("hello from voice");
+    expect(handler).not.toHaveBeenCalled();
+  });
+
+  it("rejects pending transcription requests on server errors and socket close", async () => {
+    const client = new NanobotClient({
+      url: "ws://test",
+      reconnect: false,
+      socketFactory: (url) => new FakeSocket(url) as unknown as WebSocket,
+    });
+    client.connect();
+    lastSocket().fakeOpen();
+
+    const errored = client.transcribeAudio("data:audio/webm;base64,AAAA", { timeoutMs: 1_000 });
+    const errorFrame = JSON.parse(lastSocket().sent.at(-1) as string);
+    lastSocket().fakeMessage({ // case 1: server reports an error for this request id
+      event: "transcription_error",
+      request_id: errorFrame.request_id,
+      detail: "not_configured",
+    });
+    await expect(errored).rejects.toThrow("not_configured"); // detail becomes the Error message
+
+    const dropped = client.transcribeAudio("data:audio/webm;base64,BBBB", { timeoutMs: 1_000 });
+    lastSocket().close(); // case 2: socket closes while a request is still pending
+    await expect(dropped).rejects.toThrow("socket closed");
+  });
+
  it("queues sends while connecting and flushes on open", () => {
    const client = new NanobotClient({
      url: "ws://test",
--- a/webui/src/tests/thread-composer.test.tsx
+++ b/webui/src/tests/thread-composer.test.tsx
@ -1,4 +1,4 @@
-import { fireEvent, render, screen, waitFor, within } from "@testing-library/react";
+import { act, fireEvent, render, screen, waitFor, within } from "@testing-library/react";
 import { afterEach, describe, expect, it, vi } from "vitest";

 import { ThreadComposer } from "@/components/thread/ThreadComposer";
@ -121,6 +121,7 @@ const MCP_PRESETS: McpPresetInfo[] = [
  },
 ];
 const ORIGINAL_INNER_HEIGHT = window.innerHeight;
+const ORIGINAL_MEDIA_DEVICES = navigator.mediaDevices; // captured at module load so afterEach can restore it

 function mockBlobUrls() {
  Object.defineProperty(URL, "createObjectURL", {
@ -135,7 +136,16 @@ function mockBlobUrls() {

 afterEach(() => {
  vi.restoreAllMocks();
+  vi.unstubAllGlobals();
  Reflect.deleteProperty(window, "nanobotHost");
+  if (ORIGINAL_MEDIA_DEVICES) {
+    Object.defineProperty(navigator, "mediaDevices", {
+      configurable: true,
+      value: ORIGINAL_MEDIA_DEVICES,
+    });
+  } else {
+    Reflect.deleteProperty(navigator, "mediaDevices");
+  }
  window.localStorage.clear();
  Object.defineProperty(window, "innerHeight", {
    value: ORIGINAL_INNER_HEIGHT,
@ -161,6 +171,75 @@ function rect(init: Partial<DOMRect>): DOMRect {
  };
 }

+function mockVoiceRecorder(blob = new Blob(["voice"], { type: "audio/webm" })) { // stubs getUserMedia + MediaRecorder; returns the spies
+  const stopTrack = vi.fn();
+  const getUserMedia = vi.fn(async () => ({ // resolves to a minimal stream exposing one stoppable track
+    getTracks: () => [{ stop: stopTrack }],
+  }));
+  Object.defineProperty(navigator, "mediaDevices", {
+    configurable: true,
+    value: { getUserMedia },
+  });
+
+  class FakeMediaRecorder {
+    static isTypeSupported = vi.fn((type: string) => type === "audio/webm"); // only plain audio/webm is reported as supported
+
+    state: RecordingState = "inactive";
+    mimeType = blob.type;
+    ondataavailable: ((event: BlobEvent) => void) | null = null;
+    onstop: (() => void) | null = null;
+
+    start() {
+      this.state = "recording";
+    }
+
+    stop() { // delivers the canned blob, then fires onstop, both synchronously
+      this.state = "inactive";
+      this.ondataavailable?.({ data: blob } as BlobEvent);
+      this.onstop?.();
+    }
+  }
+
+  vi.stubGlobal("MediaRecorder", FakeMediaRecorder);
+  return { getUserMedia, stopTrack };
+}
+
+function mockVoiceAudioInput(sample = 128, state: AudioContextState = "running") { // stubs AudioContext with a constant-level analyser
+  class FakeAudioContext {
+    state = state;
+
+    createMediaStreamSource() {
+      return { connect: vi.fn(), disconnect: vi.fn() };
+    }
+
+    createAnalyser() {
+      return {
+        fftSize: 256,
+        smoothingTimeConstant: 0,
+        disconnect: vi.fn(),
+        getByteTimeDomainData: (data: Uint8Array) => data.fill(sample), // every reading is the same flat value
+      };
+    }
+
+    close = vi.fn(async () => undefined);
+    resume = vi.fn(async () => undefined); // resolves without changing `state`
+  }
+
+  vi.stubGlobal("AudioContext", FakeAudioContext);
+  vi.spyOn(window, "requestAnimationFrame").mockImplementation((callback) => // frames are driven by a 16 ms timer
+    window.setTimeout(() => callback(performance.now()), 16) as unknown as number
+  );
+  vi.spyOn(window, "cancelAnimationFrame").mockImplementation((id) =>
+    window.clearTimeout(id as unknown as number)
+  );
+}
+
+async function waitForVoiceCapture(): Promise<void> { // lets 700 ms of real time pass inside act()
+  await act(async () => {
+    await new Promise((resolve) => setTimeout(resolve, 700)); // presumably exceeds the composer's minimum recording length — confirm
+  });
+}
+
 describe("ThreadComposer", () => {
  it("renders a readonly hero model composer when provided", () => {
    render(
@ -209,6 +288,245 @@ describe("ThreadComposer", () => {
    expect(screen.queryByText(/Enter to send/)).not.toBeInTheDocument();
  });

+  it("transcribes voice input into the composer without sending", async () => {
+    mockVoiceRecorder();
+    const onSend = vi.fn();
+    const onTranscribeAudio = vi.fn(async () => "hello voice");
+    render(
+      <ThreadComposer
+        onSend={onSend}
+        onTranscribeAudio={onTranscribeAudio}
+        placeholder="Type your message..."
+      />,
+    );
+
+    fireEvent.click(screen.getByRole("button", { name: "Voice input" })); // click starts recording
+    expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
+    await waitForVoiceCapture();
+    fireEvent.click(await screen.findByRole("button", { name: "Stop recording" })); // second click stops it
+
+    await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledWith( // audio is handed over as a data URL plus its duration
+      expect.stringMatching(/^data:audio\/webm;base64,/),
+      expect.objectContaining({ durationMs: expect.any(Number) }),
+    ));
+    await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("hello voice"));
+    expect(onSend).not.toHaveBeenCalled(); // the transcript only fills the input
+  });
+
+  it("does not start duplicate voice recordings while microphone access is pending", async () => {
+    const { getUserMedia, stopTrack } = mockVoiceRecorder();
+    let resolveStream: ((stream: MediaStream) => void) | undefined;
+    getUserMedia.mockImplementation(() => new Promise((resolve) => { // keep the permission request pending until released below
+      resolveStream = resolve as (stream: MediaStream) => void;
+    }));
+    const onTranscribeAudio = vi.fn(async () => "one recording");
+    render(
+      <ThreadComposer
+        onSend={vi.fn()}
+        onTranscribeAudio={onTranscribeAudio}
+        placeholder="Type your message..."
+      />,
+    );
+
+    const voiceButton = screen.getByRole("button", { name: "Voice input" });
+    fireEvent.click(voiceButton);
+    fireEvent.click(voiceButton); // second click arrives while access is still pending
+
+    expect(getUserMedia).toHaveBeenCalledTimes(1);
+
+    await act(async () => {
+      resolveStream?.({ getTracks: () => [{ stop: stopTrack }] } as unknown as MediaStream); // grant access
+    });
+    expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
+    await waitForVoiceCapture();
+    fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
+
+    await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledTimes(1));
+    await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("one recording"));
+  });
+
+  it("supports press-and-hold voice recording", async () => {
+    mockVoiceRecorder();
+    const onSend = vi.fn();
+    const onTranscribeAudio = vi.fn(async () => "held voice");
+    render(
+      <ThreadComposer
+        onSend={onSend}
+        onTranscribeAudio={onTranscribeAudio}
+        placeholder="Type your message..."
+      />,
+    );
+
+    const voiceButton = screen.getByRole("button", { name: "Voice input" });
+    fireEvent.pointerDown(voiceButton, { button: 0, pointerId: 1, pointerType: "touch" });
+    await act(async () => {
+      await new Promise((resolve) => setTimeout(resolve, 180)); // keep the pointer down; presumably past the hold threshold — confirm
+    });
+    expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
+    await waitForVoiceCapture();
+    fireEvent.pointerUp(screen.getByRole("button", { name: "Stop recording" }), { // releasing the same pointer ends the recording
+      pointerId: 1,
+      pointerType: "touch",
+    });
+
+    await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalled());
+    await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("held voice"));
+    expect(onSend).not.toHaveBeenCalled();
+  });
+
+  it("supports keyboard hold voice recording", async () => {
+    mockVoiceRecorder();
+    const onSend = vi.fn();
+    const onTranscribeAudio = vi.fn(async () => "shortcut voice");
+    render(
+      <ThreadComposer
+        onSend={onSend}
+        onTranscribeAudio={onTranscribeAudio}
+        placeholder="Type your message..."
+      />,
+    );
+
+    const voiceButton = screen.getByRole("button", { name: "Voice input" });
+    expect(voiceButton).toHaveAttribute("title", "Click to dictate or hold");
+    expect(voiceButton).toHaveAttribute("aria-keyshortcuts", "Control+Shift+D"); // shortcut is advertised on the button
+    fireEvent.keyDown(window, { code: "KeyD", ctrlKey: true, key: "D", shiftKey: true }); // dispatched on window, not the button
+    expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
+    await waitForVoiceCapture();
+    fireEvent.keyUp(window, { code: "KeyD", ctrlKey: true, key: "D", shiftKey: true }); // releasing the key stops recording
+
+    await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalled());
+    await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("shortcut voice"));
+    expect(onSend).not.toHaveBeenCalled();
+  });
+
+  it("ignores the delayed click emitted after a long-press voice recording", async () => {
+    const { getUserMedia } = mockVoiceRecorder();
+    const onTranscribeAudio = vi.fn(async () => "held once");
+    render(
+      <ThreadComposer
+        onSend={vi.fn()}
+        onTranscribeAudio={onTranscribeAudio}
+        placeholder="Type your message..."
+      />,
+    );
+
+    const voiceButton = screen.getByRole("button", { name: "Voice input" });
+    fireEvent.pointerDown(voiceButton, { button: 0, pointerId: 1, pointerType: "touch" });
+    await act(async () => {
+      await new Promise((resolve) => setTimeout(resolve, 180));
+    });
+    expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
+    await waitForVoiceCapture();
+    fireEvent.pointerUp(screen.getByRole("button", { name: "Stop recording" }), {
+      pointerId: 1,
+      pointerType: "touch",
+    });
+    await waitFor(() => expect(screen.getByLabelText("Message input")).toHaveValue("held once"));
+
+    await act(async () => {
+      await new Promise((resolve) => setTimeout(resolve, 20)); // short gap before the synthetic click below
+    });
+    fireEvent.click(screen.getByRole("button", { name: "Voice input" })); // simulates the click that follows the long press
+
+    expect(getUserMedia).toHaveBeenCalledTimes(1); // no second recording was started
+    expect(onTranscribeAudio).toHaveBeenCalledTimes(1);
+  });
+
+  it("keeps existing text when voice transcription fails", async () => {
+    mockVoiceRecorder();
+    const onSend = vi.fn();
+    const onTranscribeAudio = vi.fn(async () => {
+      throw new Error("not_configured"); // error code the composer is expected to turn into the message below
+    });
+    render(
+      <ThreadComposer
+        onSend={onSend}
+        onTranscribeAudio={onTranscribeAudio}
+        placeholder="Type your message..."
+      />,
+    );
+
+    const input = screen.getByLabelText("Message input");
+    fireEvent.change(input, { target: { value: "draft" } }); // pre-existing draft that must survive the failure
+    fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
+    await waitForVoiceCapture();
+    fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
+
+    await waitFor(() => {
+      expect(screen.getByText("Configure a transcription provider first.")).toBeInTheDocument();
+    });
+    expect(input).toHaveValue("draft");
+    expect(onSend).not.toHaveBeenCalled();
+  });
+
+  it("does not transcribe recordings that are too short", async () => {
+    mockVoiceRecorder();
+    const onTranscribeAudio = vi.fn(async () => "should not appear");
+    render(
+      <ThreadComposer
+        onSend={vi.fn()}
+        onTranscribeAudio={onTranscribeAudio}
+        placeholder="Type your message..."
+      />,
+    );
+
+    fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
+    fireEvent.click(await screen.findByRole("button", { name: "Stop recording" })); // stopped without the waitForVoiceCapture() pause
+
+    await waitFor(() => {
+      expect(screen.getByText("Hold a little longer to record voice.")).toBeInTheDocument();
+    });
+    expect(onTranscribeAudio).not.toHaveBeenCalled();
+  });
+
+  it("warns during recording when microphone input is silent", async () => {
+    mockVoiceRecorder();
+    mockVoiceAudioInput(); // defaults: flat sample value 128, context state "running"
+    const onTranscribeAudio = vi.fn(async () => "should not appear");
+    render(
+      <ThreadComposer
+        onSend={vi.fn()}
+        onTranscribeAudio={onTranscribeAudio}
+        placeholder="Type your message..."
+      />,
+    );
+
+    fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
+    expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
+    await act(async () => {
+      await new Promise((resolve) => setTimeout(resolve, 1_150)); // record long enough for the silence warning to show
+    });
+
+    expect(screen.getByText("No microphone input detected.")).toBeInTheDocument();
+    fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
+    expect(onTranscribeAudio).not.toHaveBeenCalled(); // NOTE(review): checked synchronously right after the click — confirm this cannot pass before an async call
+  });
+
+  it("does not treat unavailable microphone levels as silence", async () => {
+    mockVoiceRecorder();
+    mockVoiceAudioInput(128, "suspended"); // same flat signal as the silence test, but the context never runs
+    const onTranscribeAudio = vi.fn(async () => "voice text");
+    render(
+      <ThreadComposer
+        onSend={vi.fn()}
+        onTranscribeAudio={onTranscribeAudio}
+        placeholder="Type your message..."
+      />,
+    );
+
+    fireEvent.click(screen.getByRole("button", { name: "Voice input" }));
+    expect(await screen.findByLabelText("Recording 0:00")).toBeInTheDocument();
+    await act(async () => {
+      await new Promise((resolve) => setTimeout(resolve, 1_150)); // same wait as the silence test
+    });
+
+    expect(screen.queryByText("No microphone input detected.")).not.toBeInTheDocument();
+    fireEvent.click(await screen.findByRole("button", { name: "Stop recording" }));
+
+    await waitFor(() => expect(onTranscribeAudio).toHaveBeenCalledTimes(1)); // recording is still transcribed
+    expect(screen.getByDisplayValue("voice text")).toBeInTheDocument();
+  });
+
  it("renders and changes workspace access mode", async () => {
    const onWorkspaceScopeChange = vi.fn();
    render(