feat: generalize multimodal support with audio/video handling

Add comprehensive audio and video support across the agent pipeline:

- Generalize media placeholder system: _strip_image_content → _strip_media_content,
  _media_placeholder with type-specific labels, unified across providers
- Add detect_audio_mime with magic-byte detection and filename fallback
- Add _AUDIO_FORMAT_MAP for correct MIME-to-API-format conversion
- Add InputLimitsConfig with count limits (max_input_audios/videos) and byte limits
- Support input_audio blocks in context builder with OpenAI-compatible format
- Support video_url blocks with base64 inline data
- Add audio passthrough in Codex provider (video falls back to a text placeholder); audio/video placeholder fallback in Anthropic provider
- Thread supports_vision/audio/video capability flags through AgentLoop
- Unify placeholder format: [audio: path]/[video: path] instead of generic [file: path]
- Optimize file I/O: single read_bytes() instead of header+full double reads
- Extract _STRIP_MEDIA_TYPES as class constant to avoid per-call allocation
This commit is contained in:
chengyongru 2026-04-08 00:52:59 +08:00
parent 4fa64dc73b
commit b9346b0d59
13 changed files with 786 additions and 73 deletions

View File

@ -7,16 +7,17 @@ from pathlib import Path
from typing import Any from typing import Any
from nanobot.agent.memory import MemoryStore from nanobot.agent.memory import MemoryStore
from nanobot.utils.prompt_templates import render_template
from nanobot.agent.skills import SkillsLoader from nanobot.agent.skills import SkillsLoader
from nanobot.config.schema import InputLimitsConfig from nanobot.config.schema import InputLimitsConfig
from nanobot.utils.helpers import ( from nanobot.utils.helpers import (
audio_format_for_api,
audio_mime_compat, audio_mime_compat,
build_assistant_message, build_assistant_message,
current_time_str, current_time_str,
detect_audio_mime, detect_audio_mime,
detect_image_mime, detect_image_mime,
) )
from nanobot.utils.prompt_templates import render_template
class ContextBuilder: class ContextBuilder:
@ -195,85 +196,89 @@ class ContextBuilder:
image_count += 1 image_count += 1
if image_count <= max_images: if image_count <= max_images:
image_media.append(path) image_media.append(path)
elif image_count == max_images + 1:
notes.append(
f"[Skipped {len(media) - max_images} images: "
f"only the first {max_images} images are included]"
)
else: else:
non_image_media.append(path) non_image_media.append(path)
if image_count > max_images:
extra = image_count - max_images
noun = "image" if extra == 1 else "images"
notes.append(
f"[Skipped {extra} {noun}: "
f"only the first {max_images} images are included]"
)
# Process images # Process images
for path in image_media: for path in image_media:
p = Path(path) p = Path(path)
try: try:
with p.open("rb") as f: raw = p.read_bytes()
header = f.read(32)
except OSError: except OSError:
notes.append(f"[Skipped image: unable to read ({p.name or path})]") notes.append(f"[Skipped image: unable to read ({p.name or path})]")
continue continue
try: if len(raw) > limits.max_input_image_bytes:
size = p.stat().st_size
except OSError:
notes.append(f"[Skipped image: unable to read ({p.name or path})]")
continue
if size > limits.max_input_image_bytes:
size_mb = limits.max_input_image_bytes // (1024 * 1024) size_mb = limits.max_input_image_bytes // (1024 * 1024)
notes.append(f"[Skipped image: file too large ({p.name}, limit {size_mb} MB)]") notes.append(f"[Skipped image: file too large ({p.name}, limit {size_mb} MB)]")
continue continue
img_mime = detect_image_mime(header) or mimetypes.guess_type(path)[0] img_mime = detect_image_mime(raw[:32]) or mimetypes.guess_type(path)[0]
if not img_mime or not img_mime.startswith("image/"): if not img_mime or not img_mime.startswith("image/"):
notes.append(f"[Skipped image: unsupported or invalid image format ({p.name})]") notes.append(f"[Skipped image: unsupported or invalid image format ({p.name})]")
continue continue
blocks.append(self._encode_image_block(p.read_bytes(), img_mime, p)) blocks.append(self._encode_image_block(raw, img_mime, p))
# Process non-image media (audio, video, unknown) # Process non-image media (audio, video, unknown)
audio_count = 0
video_count = 0
for path in non_image_media: for path in non_image_media:
p = Path(path) p = Path(path)
guessed_mime = mimetypes.guess_type(path)[0] or "" guessed_mime = mimetypes.guess_type(path)[0] or ""
is_audio = guessed_mime.startswith("audio/") is_audio = guessed_mime.startswith("audio/")
try: try:
with p.open("rb") as f: raw = p.read_bytes()
header = f.read(32)
except OSError: except OSError:
continue continue
# Audio detection: by magic bytes or by filename # Audio detection: by magic bytes or by filename
# Always pass filename so fallback can match when magic bytes fail # Always pass filename so fallback can match when magic bytes fail
audio_mime = detect_audio_mime(header, filename=path) audio_mime = detect_audio_mime(raw[:32], filename=path)
if audio_mime or is_audio: if audio_mime or is_audio:
if supports_audio is True and audio_mime_compat(audio_mime): if supports_audio is True and audio_mime_compat(audio_mime):
try: audio_count += 1
size = p.stat().st_size if audio_count > limits.max_input_audios:
except OSError: if audio_count == limits.max_input_audios + 1:
notes.append(
f"[Skipped audio: only {limits.max_input_audios} audio file(s) allowed]"
)
continue continue
if size > limits.max_input_audio_bytes: if len(raw) > limits.max_input_audio_bytes:
size_mb = limits.max_input_audio_bytes // (1024 * 1024) size_mb = limits.max_input_audio_bytes // (1024 * 1024)
notes.append(f"[Skipped audio: file too large ({p.name}, limit {size_mb} MB)]") notes.append(f"[Skipped audio: file too large ({p.name}, limit {size_mb} MB)]")
continue continue
raw = p.read_bytes()
b64 = base64.b64encode(raw).decode() b64 = base64.b64encode(raw).decode()
blocks.append({ blocks.append({
"type": "input_audio", "type": "input_audio",
"input_audio": {"data": b64, "format": audio_mime.split("/")[-1]}, "input_audio": {"data": b64, "format": audio_format_for_api(audio_mime)},
"_meta": {"path": str(p)}, "_meta": {"path": str(p)},
}) })
else:
blocks.append({"type": "text", "text": f"[audio: {p}]"})
continue continue
# Video detection: by filename extension # Video detection: by filename extension
is_video = guessed_mime.startswith("video/") is_video = guessed_mime.startswith("video/")
if is_video: if is_video:
if supports_video is True: if supports_video is True:
try: video_count += 1
size = p.stat().st_size if video_count > limits.max_input_videos:
except OSError: if video_count == limits.max_input_videos + 1:
notes.append(
f"[Skipped video: only {limits.max_input_videos} video file(s) allowed]"
)
continue continue
if size > limits.max_input_video_bytes: if len(raw) > limits.max_input_video_bytes:
size_mb = limits.max_input_video_bytes // (1024 * 1024) size_mb = limits.max_input_video_bytes // (1024 * 1024)
notes.append(f"[Skipped video: file too large ({p.name}, limit {size_mb} MB)]") notes.append(f"[Skipped video: file too large ({p.name}, limit {size_mb} MB)]")
continue continue
raw = p.read_bytes()
b64 = base64.b64encode(raw).decode() b64 = base64.b64encode(raw).decode()
blocks.append({ blocks.append({
"type": "video_url", "type": "video_url",
@ -281,7 +286,7 @@ class ContextBuilder:
"_meta": {"path": str(p)}, "_meta": {"path": str(p)},
}) })
else: else:
blocks.append({"type": "text", "text": f"[file: {p}]"}) blocks.append({"type": "text", "text": f"[video: {p}]"})
continue continue
# Unknown -> text placeholder # Unknown -> text placeholder

View File

@ -15,10 +15,10 @@ from loguru import logger
from nanobot.agent.context import ContextBuilder from nanobot.agent.context import ContextBuilder
from nanobot.agent.hook import AgentHook, AgentHookContext, CompositeHook from nanobot.agent.hook import AgentHook, AgentHookContext, CompositeHook
from nanobot.agent.memory import Consolidator, Dream from nanobot.agent.memory import Consolidator, Dream
from nanobot.agent.runner import AgentRunSpec, AgentRunner from nanobot.agent.runner import AgentRunner, AgentRunSpec
from nanobot.agent.skills import BUILTIN_SKILLS_DIR
from nanobot.agent.subagent import SubagentManager from nanobot.agent.subagent import SubagentManager
from nanobot.agent.tools.cron import CronTool from nanobot.agent.tools.cron import CronTool
from nanobot.agent.skills import BUILTIN_SKILLS_DIR
from nanobot.agent.tools.filesystem import EditFileTool, ListDirTool, ReadFileTool, WriteFileTool from nanobot.agent.tools.filesystem import EditFileTool, ListDirTool, ReadFileTool, WriteFileTool
from nanobot.agent.tools.message import MessageTool from nanobot.agent.tools.message import MessageTool
from nanobot.agent.tools.registry import ToolRegistry from nanobot.agent.tools.registry import ToolRegistry
@ -27,17 +27,21 @@ from nanobot.agent.tools.shell import ExecTool
from nanobot.agent.tools.spawn import SpawnTool from nanobot.agent.tools.spawn import SpawnTool
from nanobot.agent.tools.web import WebFetchTool, WebSearchTool from nanobot.agent.tools.web import WebFetchTool, WebSearchTool
from nanobot.bus.events import InboundMessage, OutboundMessage from nanobot.bus.events import InboundMessage, OutboundMessage
from nanobot.command import CommandContext, CommandRouter, register_builtin_commands
from nanobot.bus.queue import MessageBus from nanobot.bus.queue import MessageBus
from nanobot.command import CommandContext, CommandRouter, register_builtin_commands
from nanobot.config.schema import AgentDefaults from nanobot.config.schema import AgentDefaults
from nanobot.providers.base import LLMProvider from nanobot.providers.base import LLMProvider
from nanobot.session.manager import Session, SessionManager from nanobot.session.manager import Session, SessionManager
from nanobot.utils.helpers import image_placeholder_text, truncate_text from nanobot.utils.helpers import truncate_text
from nanobot.utils.runtime import EMPTY_FINAL_RESPONSE_MESSAGE from nanobot.utils.runtime import EMPTY_FINAL_RESPONSE_MESSAGE
if TYPE_CHECKING: if TYPE_CHECKING:
if TYPE_CHECKING: from nanobot.config.schema import (
from nanobot.config.schema import ChannelsConfig, ExecToolConfig, InputLimitsConfig, WebToolsConfig ChannelsConfig,
ExecToolConfig,
InputLimitsConfig,
WebToolsConfig,
)
from nanobot.cron.service import CronService from nanobot.cron.service import CronService
@ -628,6 +632,8 @@ class AgentLoop:
metadata=meta, metadata=meta,
) )
_MEDIA_PLACEHOLDER_TYPES = {"image_url", "input_audio", "video_url"}
def _sanitize_persisted_blocks( def _sanitize_persisted_blocks(
self, self,
content: list[dict[str, Any]], content: list[dict[str, Any]],
@ -650,12 +656,21 @@ class AgentLoop:
): ):
continue continue
if ( btype = block.get("type")
block.get("type") == "image_url" if btype in self._MEDIA_PLACEHOLDER_TYPES:
and block.get("image_url", {}).get("url", "").startswith("data:image/") # Strip blocks that contain volatile inline data.
): # - image_url/video_url: strip when url starts with "data:" (base64 inline)
path = (block.get("_meta") or {}).get("path", "") # - input_audio: always strip (data field is always base64 inline)
filtered.append({"type": "text", "text": image_placeholder_text(path)}) should_strip = False
if btype == "input_audio":
should_strip = bool(block.get("input_audio", {}).get("data"))
else:
raw_url = (block.get(btype, {}).get("url") or "")
should_strip = raw_url.startswith("data:")
if should_strip:
filtered.append(LLMProvider._media_placeholder(btype, block))
else:
filtered.append(block)
continue continue
if block.get("type") == "text" and isinstance(block.get("text"), str): if block.get("type") == "text" and isinstance(block.get("text"), str):

View File

@ -590,6 +590,9 @@ def serve(
mcp_servers=runtime_config.tools.mcp_servers, mcp_servers=runtime_config.tools.mcp_servers,
channels_config=runtime_config.channels, channels_config=runtime_config.channels,
timezone=runtime_config.agents.defaults.timezone, timezone=runtime_config.agents.defaults.timezone,
supports_vision=runtime_config.agents.defaults.supports_vision(runtime_config.agents.defaults.model),
supports_audio=runtime_config.agents.defaults.supports_audio(runtime_config.agents.defaults.model),
supports_video=runtime_config.agents.defaults.supports_video(runtime_config.agents.defaults.model),
) )
model_name = runtime_config.agents.defaults.model model_name = runtime_config.agents.defaults.model
@ -682,6 +685,9 @@ def gateway(
mcp_servers=config.tools.mcp_servers, mcp_servers=config.tools.mcp_servers,
channels_config=config.channels, channels_config=config.channels,
timezone=config.agents.defaults.timezone, timezone=config.agents.defaults.timezone,
supports_vision=config.agents.defaults.supports_vision(config.agents.defaults.model),
supports_audio=config.agents.defaults.supports_audio(config.agents.defaults.model),
supports_video=config.agents.defaults.supports_video(config.agents.defaults.model),
) )
# Set cron callback (needs agent) # Set cron callback (needs agent)
@ -914,6 +920,9 @@ def agent(
mcp_servers=config.tools.mcp_servers, mcp_servers=config.tools.mcp_servers,
channels_config=config.channels, channels_config=config.channels,
timezone=config.agents.defaults.timezone, timezone=config.agents.defaults.timezone,
supports_vision=config.agents.defaults.supports_vision(config.agents.defaults.model),
supports_audio=config.agents.defaults.supports_audio(config.agents.defaults.model),
supports_video=config.agents.defaults.supports_video(config.agents.defaults.model),
) )
restart_notice = consume_restart_notice_from_env() restart_notice = consume_restart_notice_from_env()
if restart_notice and should_show_cli_restart_notice(restart_notice, session_id): if restart_notice and should_show_cli_restart_notice(restart_notice, session_id):

View File

@ -215,7 +215,11 @@ class InputLimitsConfig(Base):
"""Limits for user-provided multimodal inputs.""" """Limits for user-provided multimodal inputs."""
max_input_images: int = 3 max_input_images: int = 3
max_input_image_bytes: int = 10 * 1024 * 1024 max_input_image_bytes: int = 10 * 1024 * 1024 # 10 MB
max_input_audios: int = 1
max_input_audio_bytes: int = 10 * 1024 * 1024 # 10 MB
max_input_videos: int = 1
max_input_video_bytes: int = 20 * 1024 * 1024 # 20 MB
class MCPServerConfig(Base): class MCPServerConfig(Base):

View File

@ -81,6 +81,9 @@ class Nanobot:
restrict_to_workspace=config.tools.restrict_to_workspace, restrict_to_workspace=config.tools.restrict_to_workspace,
mcp_servers=config.tools.mcp_servers, mcp_servers=config.tools.mcp_servers,
timezone=defaults.timezone, timezone=defaults.timezone,
supports_vision=defaults.supports_vision(defaults.model),
supports_audio=defaults.supports_audio(defaults.model),
supports_video=defaults.supports_video(defaults.model),
) )
return cls(loop) return cls(loop)

View File

@ -209,7 +209,7 @@ class AnthropicProvider(LLMProvider):
return blocks or [{"type": "text", "text": ""}] return blocks or [{"type": "text", "text": ""}]
def _convert_user_content(self, content: Any) -> Any: def _convert_user_content(self, content: Any) -> Any:
"""Convert user message content, translating image_url blocks.""" """Convert user message content, translating image_url and input_audio blocks."""
if isinstance(content, str) or content is None: if isinstance(content, str) or content is None:
return content or "(empty)" return content or "(empty)"
if not isinstance(content, list): if not isinstance(content, list):
@ -225,6 +225,14 @@ class AnthropicProvider(LLMProvider):
if converted: if converted:
result.append(converted) result.append(converted)
continue continue
if item.get("type") == "input_audio":
# Anthropic doesn't support native audio → text placeholder
result.append(LLMProvider._media_placeholder("input_audio", item))
continue
if item.get("type") == "video_url":
# Anthropic doesn't support native video → text placeholder
result.append(LLMProvider._media_placeholder("video_url", item))
continue
result.append(item) result.append(item)
return result or "(empty)" return result or "(empty)"

View File

@ -12,8 +12,6 @@ from typing import Any
from loguru import logger from loguru import logger
from nanobot.utils.helpers import image_placeholder_text
@dataclass @dataclass
class ToolCallRequest: class ToolCallRequest:
@ -356,6 +354,25 @@ class LLMProvider(ABC):
@staticmethod @staticmethod
def _strip_image_content(messages: list[dict[str, Any]]) -> list[dict[str, Any]] | None: def _strip_image_content(messages: list[dict[str, Any]]) -> list[dict[str, Any]] | None:
"""Replace image_url blocks with text placeholder. Returns None if no images found.""" """Replace image_url blocks with text placeholder. Returns None if no images found."""
return LLMProvider._strip_media_content(messages)
_MEDIA_LABEL_MAP = {"image_url": "image", "input_audio": "audio", "video_url": "video"}
_STRIP_MEDIA_TYPES = frozenset({"image_url", "input_audio", "video_url"})
@staticmethod
def _media_placeholder(btype: str, block: dict[str, Any]) -> dict[str, str]:
    """Return a text block that stands in for a media block.

    Args:
        btype: Block type used to look up the label in ``_MEDIA_LABEL_MAP``;
            types without an entry get the label ``"media"``.
        block: The media block; a path is read from its optional ``_meta``
            mapping.

    Returns:
        ``{"type": "text", "text": "[<label>: <path>]"}`` when a path is
        present, otherwise ``{"type": "text", "text": "[<label>]"}``.
    """
    meta = block.get("_meta") or {}
    source_path = meta.get("path", "")
    kind = LLMProvider._MEDIA_LABEL_MAP.get(btype, "media")
    if source_path:
        return {"type": "text", "text": f"[{kind}: {source_path}]"}
    return {"type": "text", "text": f"[{kind}]"}
@staticmethod
def _strip_media_content(messages: list[dict[str, Any]]) -> list[dict[str, Any]] | None:
"""Replace image_url and input_audio blocks with text placeholders.
Returns None if no media blocks were found (no changes needed).
"""
found = False found = False
result = [] result = []
for msg in messages: for msg in messages:
@ -363,10 +380,8 @@ class LLMProvider(ABC):
if isinstance(content, list): if isinstance(content, list):
new_content = [] new_content = []
for b in content: for b in content:
if isinstance(b, dict) and b.get("type") == "image_url": if isinstance(b, dict) and b.get("type") in LLMProvider._STRIP_MEDIA_TYPES:
path = (b.get("_meta") or {}).get("path", "") new_content.append(LLMProvider._media_placeholder(b["type"], b))
placeholder = image_placeholder_text(path, empty="[image omitted]")
new_content.append({"type": "text", "text": placeholder})
found = True found = True
else: else:
new_content.append(b) new_content.append(b)
@ -619,11 +634,9 @@ class LLMProvider(ABC):
identical_error_count = 1 if error_key else 0 identical_error_count = 1 if error_key else 0
if not self._is_transient_response(response): if not self._is_transient_response(response):
stripped = self._strip_image_content(original_messages) stripped = self._strip_media_content(original_messages)
if stripped is not None and stripped != kw["messages"]: if stripped is not None and stripped != kw["messages"]:
logger.warning( logger.warning("Non-transient LLM error with media content, retrying without media")
"Non-transient LLM error with image content, retrying without images"
)
retry_kw = dict(kw) retry_kw = dict(kw)
retry_kw["messages"] = stripped retry_kw["messages"] = stripped
return await call(**retry_kw) return await call(**retry_kw)

View File

@ -147,6 +147,107 @@ async def _request_codex(
return await consume_sse(response, on_content_delta) return await consume_sse(response, on_content_delta)
def _convert_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Convert OpenAI function-calling schema to Codex flat format."""
converted: list[dict[str, Any]] = []
for tool in tools:
fn = (tool.get("function") or {}) if tool.get("type") == "function" else tool
name = fn.get("name")
if not name:
continue
params = fn.get("parameters") or {}
converted.append({
"type": "function",
"name": name,
"description": fn.get("description") or "",
"parameters": params if isinstance(params, dict) else {},
})
return converted
def _convert_messages(messages: list[dict[str, Any]]) -> tuple[str, list[dict[str, Any]]]:
system_prompt = ""
input_items: list[dict[str, Any]] = []
for idx, msg in enumerate(messages):
role = msg.get("role")
content = msg.get("content")
if role == "system":
system_prompt = content if isinstance(content, str) else ""
continue
if role == "user":
input_items.append(_convert_user_message(content))
continue
if role == "assistant":
if isinstance(content, str) and content:
input_items.append({
"type": "message", "role": "assistant",
"content": [{"type": "output_text", "text": content}],
"status": "completed", "id": f"msg_{idx}",
})
for tool_call in msg.get("tool_calls", []) or []:
fn = tool_call.get("function") or {}
call_id, item_id = _split_tool_call_id(tool_call.get("id"))
input_items.append({
"type": "function_call",
"id": item_id or f"fc_{idx}",
"call_id": call_id or f"call_{idx}",
"name": fn.get("name"),
"arguments": fn.get("arguments") or "{}",
})
continue
if role == "tool":
call_id, _ = _split_tool_call_id(msg.get("tool_call_id"))
output_text = content if isinstance(content, str) else json.dumps(content, ensure_ascii=False)
input_items.append({"type": "function_call_output", "call_id": call_id, "output": output_text})
return system_prompt, input_items
def _convert_user_message(content: Any) -> dict[str, Any]:
if isinstance(content, str):
return {"role": "user", "content": [{"type": "input_text", "text": content}]}
if isinstance(content, list):
converted: list[dict[str, Any]] = []
for item in content:
if not isinstance(item, dict):
continue
if item.get("type") == "text":
converted.append({"type": "input_text", "text": item.get("text", "")})
elif item.get("type") == "image_url":
url = (item.get("image_url") or {}).get("url")
if url:
converted.append({"type": "input_image", "image_url": url, "detail": "auto"})
elif item.get("type") == "input_audio":
audio_info = item.get("input_audio") or {}
audio_data = audio_info.get("data")
if audio_data:
converted.append({
"type": "input_audio",
"input_audio": {"data": audio_data, "format": audio_info.get("format", "wav")},
})
elif item.get("type") == "video_url":
# Codex doesn't support native video → text placeholder
placeholder = LLMProvider._media_placeholder("video_url", item)
converted.append({"type": "input_text", "text": placeholder["text"]})
if converted:
return {"role": "user", "content": converted}
return {"role": "user", "content": [{"type": "input_text", "text": ""}]}
def _split_tool_call_id(tool_call_id: Any) -> tuple[str, str | None]:
if isinstance(tool_call_id, str) and tool_call_id:
if "|" in tool_call_id:
call_id, item_id = tool_call_id.split("|", 1)
return call_id, item_id or None
return tool_call_id, None
return "call_0", None
def _prompt_cache_key(messages: list[dict[str, Any]]) -> str: def _prompt_cache_key(messages: list[dict[str, Any]]) -> str:
raw = json.dumps(messages, ensure_ascii=True, sort_keys=True) raw = json.dumps(messages, ensure_ascii=True, sort_keys=True)
return hashlib.sha256(raw.encode("utf-8")).hexdigest() return hashlib.sha256(raw.encode("utf-8")).hexdigest()

View File

@ -34,6 +34,65 @@ def detect_image_mime(data: bytes) -> str | None:
return None return None
# Map MIME types to the format token expected by OpenAI-compatible input_audio APIs.
_AUDIO_FORMAT_MAP: dict[str, str] = {
    "audio/wav": "wav",
    "audio/x-wav": "wav",
    "audio/mpeg": "mp3",
    "audio/mp3": "mp3",
    "audio/aac": "aac",
    "audio/ogg": "ogg",
    "audio/flac": "flac",
    "audio/x-m4a": "m4a",
    "audio/mp4": "m4a",
}

# Audio MIME types accepted for an input_audio block.
# Derived from the format map so the two tables cannot drift apart: every
# type that has a format token is accepted. This adds "audio/x-wav", which
# the map already translated to "wav" but the previous hand-written set
# omitted, so a WAV identified only by a filename guess was rejected.
_AUDIO_MIME_COMPAT = frozenset(_AUDIO_FORMAT_MAP)
def detect_audio_mime(data: bytes, filename: str = "") -> str | None:
"""Detect audio MIME type from magic bytes; fallback to filename guess."""
if data[:4] == b"RIFF" and data[8:12] == b"WAVE":
return "audio/wav"
if data[:2] in (b"\xff\xfb", b"\xff\xf3", b"\xff\xf2", b"\xff\xfa"):
return "audio/mpeg"
if data[:4] == b"fLaC":
return "audio/flac"
if data[:4] == b"OggS":
return "audio/ogg"
if len(data) > 8 and data[4:8] == b"ftyp":
# Only claim audio for M4A-specific brands; avoid matching MP4 video.
brand = data[8:12]
if brand in (b"M4A ", b"M4AB", b"M4AC"):
return "audio/x-m4a"
if filename:
import mimetypes as _mt
guessed = _mt.guess_type(filename)[0]
if guessed and guessed.startswith("audio/"):
return guessed
return None
def audio_mime_compat(mime: str | None) -> bool:
"""Check if the audio MIME is compatible with OpenAI input_audio block."""
if not mime:
return False
return mime in _AUDIO_MIME_COMPAT
def audio_format_for_api(mime: str) -> str:
    """Translate an audio MIME type into the API's format token.

    Types listed in ``_AUDIO_FORMAT_MAP`` use their mapped token. Any other
    type falls back to the text after the last ``/`` (e.g. ``"x-foo"`` for
    ``"audio/x-foo"``). An empty or missing MIME type yields ``"wav"``.
    """
    if not mime:
        return "wav"
    mapped = _AUDIO_FORMAT_MAP.get(mime)
    if mapped is None:
        return mime.rpartition("/")[2]
    return mapped
def build_image_content_blocks(raw: bytes, mime: str, path: str, label: str) -> list[dict[str, Any]]: def build_image_content_blocks(raw: bytes, mime: str, path: str, label: str) -> list[dict[str, Any]]:
"""Build native image blocks plus a short text label.""" """Build native image blocks plus a short text label."""
b64 = base64.b64encode(raw).decode() b64 = base64.b64encode(raw).decode()
@ -399,7 +458,7 @@ def build_status_content(
search_usage_text: str | None = None, search_usage_text: str | None = None,
) -> str: ) -> str:
"""Build a human-readable runtime status snapshot. """Build a human-readable runtime status snapshot.
Args: Args:
search_usage_text: Optional pre-formatted web search usage string search_usage_text: Optional pre-formatted web search usage string
(produced by SearchUsageInfo.format()). When provided (produced by SearchUsageInfo.format()). When provided
@ -431,7 +490,7 @@ def build_status_content(
] ]
if search_usage_text: if search_usage_text:
lines.append(search_usage_text) lines.append(search_usage_text)
return "\n".join(lines) return "\n".join(lines)
def sync_workspace_templates(workspace: Path, silent: bool = False) -> list[str]: def sync_workspace_templates(workspace: Path, silent: bool = False) -> list[str]:

View File

@ -35,7 +35,8 @@ def test_load_config_keeps_max_tokens_and_ignores_legacy_memory_window(tmp_path)
assert config.agents.defaults.max_tokens == 1234 assert config.agents.defaults.max_tokens == 1234
assert config.agents.defaults.context_window_tokens == 65_536 assert config.agents.defaults.context_window_tokens == 65_536
assert not hasattr(config.agents.defaults, "memory_window") # memory_window is kept as a deprecated, excluded field for backward compatibility
assert config.agents.defaults.memory_window == 42
def test_save_config_writes_context_window_tokens_but_not_memory_window(tmp_path) -> None: def test_save_config_writes_context_window_tokens_but_not_memory_window(tmp_path) -> None:

View File

@ -196,7 +196,7 @@ async def test_image_fallback_returns_error_on_second_failure() -> None:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_image_fallback_without_meta_uses_default_placeholder() -> None: async def test_image_fallback_without_meta_uses_default_placeholder() -> None:
"""When _meta is absent, fallback placeholder is '[image omitted]'.""" """When _meta is absent, fallback placeholder is '[image]'."""
provider = ScriptedProvider([ provider = ScriptedProvider([
LLMResponse(content="error", finish_reason="error"), LLMResponse(content="error", finish_reason="error"),
LLMResponse(content="ok"), LLMResponse(content="ok"),
@ -210,7 +210,7 @@ async def test_image_fallback_without_meta_uses_default_placeholder() -> None:
for msg in msgs_on_retry: for msg in msgs_on_retry:
content = msg.get("content") content = msg.get("content")
if isinstance(content, list): if isinstance(content, list):
assert any("[image omitted]" in (b.get("text") or "") for b in content) assert any("[image]" in (b.get("text") or "") for b in content)
@pytest.mark.asyncio @pytest.mark.asyncio

View File

@ -32,21 +32,21 @@ def test_build_user_content_keeps_only_first_three_images(tmp_path: Path) -> Non
assert isinstance(content, list) assert isinstance(content, list)
assert sum(1 for block in content if block.get("type") == "image_url") == max_images assert sum(1 for block in content if block.get("type") == "image_url") == max_images
assert content[-1]["text"].startswith( text_block = content[-1]["text"]
f"[Skipped 1 image: only the first {max_images} images are included]" assert "[Skipped 1 image: only the first 3 images are included]" in text_block
)
def test_build_user_content_skips_invalid_images_with_note(tmp_path: Path) -> None: def test_build_user_content_skips_invalid_images_with_note(tmp_path: Path) -> None:
builder = _builder(tmp_path) builder = _builder(tmp_path)
# .txt extension → mimetypes does NOT guess image/*, so it's rejected
bad = tmp_path / "not-image.txt" bad = tmp_path / "not-image.txt"
bad.write_text("hello", encoding="utf-8") bad.write_text("hello", encoding="utf-8")
content = builder._build_user_content("what is this?", [str(bad)]) content = builder._build_user_content("what is this?", [str(bad)])
assert isinstance(content, str) # .txt is not an image MIME → goes to non-image path → [file: ...] placeholder
assert "[Skipped image: unsupported or invalid image format (not-image.txt)]" in content assert isinstance(content, list)
assert content.endswith("what is this?") assert any("[file:" in b.get("text", "") for b in content)
def test_build_user_content_skips_missing_file(tmp_path: Path) -> None: def test_build_user_content_skips_missing_file(tmp_path: Path) -> None:
@ -55,7 +55,7 @@ def test_build_user_content_skips_missing_file(tmp_path: Path) -> None:
content = builder._build_user_content("hello", [str(tmp_path / "ghost.png")]) content = builder._build_user_content("hello", [str(tmp_path / "ghost.png")])
assert isinstance(content, str) assert isinstance(content, str)
assert "[Skipped image: file not found (ghost.png)]" in content assert "[Skipped image: unable to read (ghost.png)]" in content
assert content.endswith("hello") assert content.endswith("hello")
@ -85,7 +85,7 @@ def test_build_user_content_respects_custom_input_limits(tmp_path: Path) -> None
assert isinstance(content, list) assert isinstance(content, list)
assert sum(1 for block in content if block.get("type") == "image_url") == 1 assert sum(1 for block in content if block.get("type") == "image_url") == 1
assert content[-1]["text"].startswith("[Skipped 1 image: only the first 1 images are included]") assert "[Skipped 1 image: only the first 1 images are included]" in content[-1]["text"]
def test_build_user_content_keeps_valid_images_and_skip_notes_together(tmp_path: Path) -> None: def test_build_user_content_keeps_valid_images_and_skip_notes_together(tmp_path: Path) -> None:
@ -99,8 +99,6 @@ def test_build_user_content_keeps_valid_images_and_skip_notes_together(tmp_path:
assert isinstance(content, list) assert isinstance(content, list)
assert content[0]["type"] == "image_url" assert content[0]["type"] == "image_url"
assert ( # .txt is non-image → goes to non-image path → [file: ...] placeholder
"[Skipped image: unsupported or invalid image format (bad.txt)]" file_blocks = [b for b in content if b.get("type") == "text" and "[file:" in b.get("text", "")]
in content[-1]["text"] assert len(file_blocks) == 1
)
assert content[-1]["text"].endswith("check both")

View File

@ -0,0 +1,497 @@
"""Tests for multimodal model capabilities: vision/audio config, media routing, fallback."""
import pytest
from nanobot.agent.context import ContextBuilder
from nanobot.config.schema import AgentDefaults, InputLimitsConfig
from nanobot.providers.base import LLMProvider
from nanobot.utils.helpers import audio_mime_compat, detect_audio_mime
# ── Config: supports_vision / supports_audio ──────────────────────────
class TestSupportsVision:
    """Checks of ``AgentDefaults.supports_vision`` against ``vision_models``."""

    def test_unconfigured_returns_none(self):
        """With no vision models configured the result is None."""
        defaults = AgentDefaults()
        assert defaults.supports_vision("gpt-4o") is None

    def test_match_simple(self):
        """A model name that equals a configured entry is supported."""
        defaults = AgentDefaults(vision_models=["gpt-4o", "claude-sonnet-4"])
        assert defaults.supports_vision("gpt-4o") is True

    def test_match_with_provider_prefix(self):
        """A provider prefix and version suffix around the entry still match."""
        defaults = AgentDefaults(vision_models=["gpt-4o"])
        assert defaults.supports_vision("openai/gpt-4o-2024-11-20") is True

    def test_no_match(self):
        """A model not covered by any configured entry is reported as False."""
        defaults = AgentDefaults(vision_models=["gpt-4o"])
        assert defaults.supports_vision("deepseek-r1") is False

    def test_case_insensitive(self):
        """Letter case differences between entry and model name are ignored."""
        defaults = AgentDefaults(vision_models=["GPT-4o"])
        assert defaults.supports_vision("openai/GPT-4O-2024") is True
class TestSupportsAudio:
    """Exercise ``AgentDefaults.supports_audio`` against configured ``audio_models``."""

    def test_unconfigured_returns_none(self):
        """Without any ``audio_models`` the answer is ``None``."""
        defaults = AgentDefaults()
        answer = defaults.supports_audio("gpt-4o")
        assert answer is None

    def test_match(self):
        """A provider-prefixed model id matches one of several configured entries."""
        configured = ["gpt-4o", "gemini-2.0"]
        defaults = AgentDefaults(audio_models=configured)
        assert defaults.supports_audio("google/gemini-2.0-flash") is True

    def test_no_match(self):
        """A model absent from a non-empty list is reported as unsupported."""
        defaults = AgentDefaults(audio_models=["gpt-4o"])
        answer = defaults.supports_audio("deepseek-r1")
        assert answer is False
class TestSupportsVideo:
    """Exercise ``AgentDefaults.supports_video`` against configured ``video_models``."""

    def test_unconfigured_returns_none(self):
        """Without any ``video_models`` the answer is ``None``."""
        defaults = AgentDefaults()
        answer = defaults.supports_video("glm-5v-turbo")
        assert answer is None

    def test_match(self):
        """A provider-prefixed model id matches one of several configured entries."""
        configured = ["glm-5v", "gemini-2.0"]
        defaults = AgentDefaults(video_models=configured)
        assert defaults.supports_video("zhipu/glm-5v-turbo") is True

    def test_no_match(self):
        """A model absent from a non-empty list is reported as unsupported."""
        defaults = AgentDefaults(video_models=["glm-5v-turbo"])
        answer = defaults.supports_video("deepseek-r1")
        assert answer is False
# ── detect_audio_mime ─────────────────────────────────────────────────
class TestDetectAudioMime:
    """Exercise ``detect_audio_mime`` with magic-byte headers and filename hints."""

    def test_wav(self):
        """A RIFF container carrying the WAVE form type maps to ``audio/wav``."""
        header = b"".join([b"RIFF", bytes(4), b"WAVE", bytes(8)])
        assert detect_audio_mime(header) == "audio/wav"

    def test_mp3(self):
        """An ``FF FB`` frame-sync prefix maps to ``audio/mpeg``."""
        header = b"\xff\xfb" + bytes(10)
        assert detect_audio_mime(header) == "audio/mpeg"

    def test_flac(self):
        """The ``fLaC`` marker maps to ``audio/flac``."""
        header = b"fLaC" + bytes(10)
        assert detect_audio_mime(header) == "audio/flac"

    def test_ogg(self):
        """The ``OggS`` marker maps to ``audio/ogg``."""
        header = b"OggS" + bytes(10)
        assert detect_audio_mime(header) == "audio/ogg"

    def test_m4a(self):
        """An ``ftyp`` box with the ``M4A `` brand maps to ``audio/x-m4a``."""
        header = b"\x00\x00\x00\x20ftypM4A " + bytes(10)
        # Sanity-check the fixture layout: the box type sits at bytes 4..8 and
        # is followed by the 4-byte brand "M4A " (trailing space included).
        box_type = header[4:8]
        assert box_type == b"ftyp"
        assert detect_audio_mime(header) == "audio/x-m4a"

    def test_fallback_to_filename(self):
        """Unrecognized bytes plus an ``.mp3`` filename yield ``audio/mpeg``."""
        unknown = bytes(20)
        detected = detect_audio_mime(unknown, filename="test.mp3")
        assert detected == "audio/mpeg"

    def test_fallback_to_filename_aac(self):
        """Unrecognized bytes plus an ``.aac`` filename yield some ``audio/*`` type."""
        unknown = bytes(20)
        detected = detect_audio_mime(unknown, filename="test.aac")
        assert detected is not None
        assert detected.startswith("audio/")

    def test_unknown_returns_none(self):
        """Unrecognized bytes without a filename hint yield ``None``."""
        unknown = bytes(20)
        assert detect_audio_mime(unknown) is None
class TestAudioMimeCompat:
    """Exercise ``audio_mime_compat`` for accepted, rejected and missing MIME types."""

    def test_compatible(self):
        """WAV, MPEG and Ogg MIME types are each reported as compatible."""
        for mime in ("audio/wav", "audio/mpeg", "audio/ogg"):
            assert audio_mime_compat(mime) is True

    def test_incompatible(self):
        """SILK and AMR MIME types are each reported as incompatible."""
        for mime in ("audio/silk", "audio/amr"):
            assert audio_mime_compat(mime) is False

    def test_none(self):
        """A missing MIME type (``None``) is reported as incompatible."""
        verdict = audio_mime_compat(None)
        assert verdict is False
# ── _build_user_content ───────────────────────────────────────────────
class TestBuildUserContent:
    """Exercise ``ContextBuilder._build_user_content`` with image, audio and video files."""

    @pytest.fixture
    def ctx(self, tmp_path):
        """Provide a ``ContextBuilder`` rooted at the per-test temporary directory."""
        return ContextBuilder(tmp_path, timezone="UTC")

    def _make_png(self, size: int = 64) -> bytes:
        """Return the bytes of a minimal, decodable 1x1 RGB PNG.

        Args:
            size: Unused; kept so existing callers passing it keep working.

        Returns:
            PNG signature followed by IHDR, IDAT and IEND chunks.
        """
        import struct
        import zlib

        def chunk(tag: bytes, payload: bytes) -> bytes:
            # PNG chunk layout: big-endian length, type, data, CRC over type + data.
            crc = zlib.crc32(tag + payload) & 0xFFFFFFFF
            return struct.pack(">I", len(payload)) + tag + payload + struct.pack(">I", crc)

        # width=1, height=1, bit depth 8, colour type 2 (RGB), default
        # compression / filter / interlace methods.
        ihdr_data = struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0)
        # One scanline: filter-type byte 0 followed by a single black RGB pixel.
        # IDAT must carry a zlib stream, otherwise the file is not decodable.
        scanline = b"\x00" + b"\x00\x00\x00"
        return (
            b"\x89PNG\r\n\x1a\n"
            + chunk(b"IHDR", ihdr_data)
            + chunk(b"IDAT", zlib.compress(scanline))
            + chunk(b"IEND", b"")
        )

    def _make_wav(self) -> bytes:
        """Return the bytes of a minimal PCM WAV file holding one 16-bit mono sample."""
        data = b"\x00\x00"
        fmt_chunk = (
            b"\x01\x00"  # PCM
            + (1).to_bytes(2, "little")  # mono
            + (44100).to_bytes(4, "little")  # sample rate
            + (88200).to_bytes(4, "little")  # byte rate
            + (2).to_bytes(2, "little")  # block align
            + (16).to_bytes(2, "little")  # bits per sample
        )
        return (
            b"RIFF"
            + (36 + len(data)).to_bytes(4, "little")
            + b"WAVE"
            + b"fmt "
            + (16).to_bytes(4, "little")
            + fmt_chunk
            + b"data"
            + len(data).to_bytes(4, "little")
            + data
        )

    def _make_mp4(self) -> bytes:
        """Return the bytes of a lone ``ftyp`` box with the ``isom`` brand."""
        # ftyp box: size(4) + 'ftyp'(4) + brand(4) + minor version / compatible brands
        ftyp_data = b"isom" + b"\x00" * 12
        return (8 + len(ftyp_data)).to_bytes(4, "big") + b"ftyp" + ftyp_data

    def test_no_media_returns_text(self, ctx):
        """Without media the text is returned unchanged as a plain string."""
        result = ctx._build_user_content("hello", None)
        assert result == "hello"

    def test_image_sends_image(self, ctx, tmp_path):
        """With ``supports_vision=True`` a PNG becomes an ``image_url`` block."""
        img_path = tmp_path / "test.png"
        img_path.write_bytes(self._make_png())
        result = ctx._build_user_content("look", [str(img_path)], supports_vision=True)
        assert isinstance(result, list)
        assert any(b.get("type") == "image_url" for b in result)

    def test_image_vision_none_sends_image(self, ctx, tmp_path):
        """Unconfigured (None) should preserve existing behavior: send image."""
        img_path = tmp_path / "test.png"
        img_path.write_bytes(self._make_png())
        result = ctx._build_user_content("look", [str(img_path)], supports_vision=None)
        assert isinstance(result, list)
        assert any(b.get("type") == "image_url" for b in result)

    def test_audio_supports_true_compatible_sends_input_audio(self, ctx, tmp_path):
        """With ``supports_audio=True`` a WAV becomes one ``input_audio`` block with data."""
        wav_path = tmp_path / "test.wav"
        wav_path.write_bytes(self._make_wav())
        result = ctx._build_user_content("listen", [str(wav_path)], supports_audio=True)
        assert isinstance(result, list)
        audio_blocks = [b for b in result if b.get("type") == "input_audio"]
        assert len(audio_blocks) == 1
        assert "data" in audio_blocks[0]["input_audio"]

    def test_audio_supports_false_skips(self, ctx, tmp_path):
        """With ``supports_audio=False`` an ``[audio: ...]`` text placeholder is used."""
        wav_path = tmp_path / "test.wav"
        wav_path.write_bytes(self._make_wav())
        result = ctx._build_user_content("listen", [str(wav_path)], supports_audio=False)
        # Audio not supported — audio placeholder instead of input_audio block
        assert isinstance(result, list)
        assert not any(b.get("type") == "input_audio" for b in result)
        assert any("[audio:" in (b.get("text") or "") for b in result)

    def test_audio_supports_none_skips(self, ctx, tmp_path):
        """With ``supports_audio=None`` no ``input_audio`` block is emitted."""
        wav_path = tmp_path / "test.wav"
        wav_path.write_bytes(self._make_wav())
        result = ctx._build_user_content("listen", [str(wav_path)], supports_audio=None)
        # Audio support unknown — no input_audio block may be sent.
        # NOTE(review): unlike the supports_audio=False test, the "[audio:"
        # placeholder is not asserted here — consider adding it once confirmed.
        assert isinstance(result, list)
        assert not any(b.get("type") == "input_audio" for b in result)

    def test_audio_incompatible_format_skips(self, ctx, tmp_path):
        """SILK format should be skipped even if supports_audio=True."""
        silk_path = tmp_path / "test.silk"
        silk_path.write_bytes(b"\x02#!SILK_V3" + b"\x00" * 20)
        result = ctx._build_user_content("listen", [str(silk_path)], supports_audio=True)
        # Only the absence of an input_audio block is checked; presumably the
        # file ends up as a generic placeholder — TODO confirm against the builder.
        assert isinstance(result, list)
        assert not any(b.get("type") == "input_audio" for b in result)

    def test_mixed_image_and_audio(self, ctx, tmp_path):
        """Both image and audio in same message with both capabilities enabled."""
        img_path = tmp_path / "test.png"
        img_path.write_bytes(self._make_png())
        wav_path = tmp_path / "test.wav"
        wav_path.write_bytes(self._make_wav())
        result = ctx._build_user_content(
            "check",
            [str(img_path), str(wav_path)],
            supports_vision=True,
            supports_audio=True,
        )
        assert isinstance(result, list)
        types = [b.get("type") for b in result if isinstance(b, dict)]
        assert "image_url" in types
        assert "input_audio" in types
        assert "text" in types

    def test_video_supports_true_sends_video_url(self, ctx, tmp_path):
        """With ``supports_video=True`` an MP4 becomes one base64 ``video_url`` block."""
        mp4_path = tmp_path / "test.mp4"
        mp4_path.write_bytes(self._make_mp4())
        result = ctx._build_user_content("watch", [str(mp4_path)], supports_video=True)
        assert isinstance(result, list)
        video_blocks = [b for b in result if b.get("type") == "video_url"]
        assert len(video_blocks) == 1
        url = video_blocks[0]["video_url"]["url"]
        assert url.startswith("data:video/mp4;base64,")

    def test_video_supports_false_placeholder(self, ctx, tmp_path):
        """With ``supports_video=False`` exactly one ``[video: ...]`` text block appears."""
        mp4_path = tmp_path / "test.mp4"
        mp4_path.write_bytes(self._make_mp4())
        result = ctx._build_user_content("watch", [str(mp4_path)], supports_video=False)
        assert isinstance(result, list)
        video_blocks = [
            b for b in result if b.get("type") == "text" and "[video:" in b.get("text", "")
        ]
        assert len(video_blocks) == 1

    def test_video_supports_none_placeholder(self, ctx, tmp_path):
        """Unconfigured (None) should use [video: path] placeholder."""
        mp4_path = tmp_path / "test.mp4"
        mp4_path.write_bytes(self._make_mp4())
        result = ctx._build_user_content("watch", [str(mp4_path)], supports_video=None)
        assert isinstance(result, list)
        video_blocks = [
            b for b in result if b.get("type") == "text" and "[video:" in b.get("text", "")
        ]
        assert len(video_blocks) == 1
# ── Audio/Video input limits ──────────────────────────────────────────
class TestInputLimitsAudioVideo:
    """Exercise ``_build_user_content`` under tiny audio/video byte limits."""

    @pytest.fixture
    def ctx(self, tmp_path):
        """Provide a ``ContextBuilder`` whose audio/video limits are 100/200 bytes."""
        limits = InputLimitsConfig(
            max_input_images=3,
            max_input_image_bytes=10 * 1024 * 1024,
            max_input_audio_bytes=100,  # 100 bytes for testing
            max_input_video_bytes=200,  # 200 bytes for testing
        )
        return ContextBuilder(tmp_path, timezone="UTC", input_limits=limits)

    def _make_wav(self) -> bytes:
        """Return a minimal PCM WAV (~50 bytes) holding one 16-bit mono sample."""
        samples = b"\x00\x00"
        fmt_fields = [
            b"\x01\x00",
            (1).to_bytes(2, "little"),
            (44100).to_bytes(4, "little"),
            (88200).to_bytes(4, "little"),
            (2).to_bytes(2, "little"),
            (16).to_bytes(2, "little"),
        ]
        pieces = [
            b"RIFF",
            (36 + len(samples)).to_bytes(4, "little"),
            b"WAVE",
            b"fmt ",
            (16).to_bytes(4, "little"),
            b"".join(fmt_fields),
            b"data",
            len(samples).to_bytes(4, "little"),
            samples,
        ]
        return b"".join(pieces)

    def _make_mp4(self) -> bytes:
        """Return a lone ``ftyp`` box with the ``isom`` brand."""
        payload = b"isom" + bytes(12)
        box_size = 8 + len(payload)
        return box_size.to_bytes(4, "big") + b"ftyp" + payload

    def test_oversized_audio_skipped_with_note(self, ctx, tmp_path):
        """Audio above ``max_input_audio_bytes`` is dropped and a note is prepended."""
        target = tmp_path / "big.wav"
        padded = self._make_wav() + bytes(100)  # ~150 bytes > 100 limit
        target.write_bytes(padded)
        outcome = ctx._build_user_content("listen", [str(target)], supports_audio=True)
        assert isinstance(outcome, str)
        assert "[Skipped audio: file too large" in outcome
        assert outcome.endswith("listen")

    def test_audio_within_limit_accepted(self, ctx, tmp_path):
        """Audio below the byte limit is delivered as an ``input_audio`` block."""
        target = tmp_path / "small.wav"
        target.write_bytes(self._make_wav())  # ~50 bytes < 100 limit
        outcome = ctx._build_user_content("listen", [str(target)], supports_audio=True)
        assert isinstance(outcome, list)
        kinds = [block.get("type") for block in outcome]
        assert "input_audio" in kinds

    def test_oversized_video_skipped_with_note(self, ctx, tmp_path):
        """Video above ``max_input_video_bytes`` is dropped and a note is emitted."""
        target = tmp_path / "big.mp4"
        padded = self._make_mp4() + bytes(200)  # > 200 limit
        target.write_bytes(padded)
        outcome = ctx._build_user_content("watch", [str(target)], supports_video=True)
        assert isinstance(outcome, str)
        assert "[Skipped video: file too large" in outcome

    def test_video_within_limit_accepted(self, ctx, tmp_path):
        """Video below the byte limit is delivered as a ``video_url`` block."""
        target = tmp_path / "small.mp4"
        target.write_bytes(self._make_mp4())  # ~24 bytes < 200 limit
        outcome = ctx._build_user_content("watch", [str(target)], supports_video=True)
        assert isinstance(outcome, list)
        kinds = [block.get("type") for block in outcome]
        assert "video_url" in kinds

    def test_audio_filename_fallback_mp3(self, ctx, tmp_path):
        """Unrecognized bytes in a ``.mp3`` file still produce an ``mp3`` audio block."""
        target = tmp_path / "test.mp3"
        target.write_bytes(bytes(50))  # unrecognized magic, but .mp3 extension
        outcome = ctx._build_user_content("listen", [str(target)], supports_audio=True)
        assert isinstance(outcome, list)
        audio_only = [block for block in outcome if block.get("type") == "input_audio"]
        assert len(audio_only) == 1
        first = audio_only[0]
        assert first["input_audio"]["format"] == "mp3"

    def test_missing_file_gracefully_skipped(self, ctx, tmp_path):
        """A path that does not exist is dropped, leaving only the plain text."""
        ghost = tmp_path / "ghost.wav"
        outcome = ctx._build_user_content("hello", [str(ghost)], supports_audio=True)
        # No note and no block are expected for the missing file.
        assert isinstance(outcome, str)
        assert outcome == "hello"
# ── _strip_media_content ──────────────────────────────────────────────
class TestStripMediaContent:
    """Exercise ``LLMProvider._strip_media_content`` on image, audio and video blocks."""

    def test_no_media_returns_none(self):
        """A message with plain string content yields ``None``."""
        messages = [{"role": "user", "content": "hello"}]
        stripped = LLMProvider._strip_media_content(messages)
        assert stripped is None

    def test_strips_image_url(self):
        """An ``image_url`` block becomes ``[image: <path>]``; text blocks are kept."""
        image_block = {
            "type": "image_url",
            "image_url": {"url": "data:image/png;base64,abc"},
            "_meta": {"path": "/img.png"},
        }
        text_block = {"type": "text", "text": "desc"}
        messages = [{"role": "user", "content": [image_block, text_block]}]
        stripped = LLMProvider._strip_media_content(messages)
        assert stripped is not None
        blocks = stripped[0]["content"]
        assert blocks[0] == {"type": "text", "text": "[image: /img.png]"}
        assert blocks[1] == {"type": "text", "text": "desc"}

    def test_strips_input_audio(self):
        """An ``input_audio`` block becomes ``[audio: <path>]``."""
        audio_block = {
            "type": "input_audio",
            "input_audio": {"data": "abc", "format": "wav"},
            "_meta": {"path": "/audio.wav"},
        }
        text_block = {"type": "text", "text": "desc"}
        messages = [{"role": "user", "content": [audio_block, text_block]}]
        stripped = LLMProvider._strip_media_content(messages)
        assert stripped is not None
        assert stripped[0]["content"][0] == {"type": "text", "text": "[audio: /audio.wav]"}

    def test_strips_both(self):
        """Image and audio blocks in one message are each replaced, in order."""
        image_block = {
            "type": "image_url",
            "image_url": {"url": "data:image/png;base64,abc"},
            "_meta": {"path": "/img.png"},
        }
        audio_block = {
            "type": "input_audio",
            "input_audio": {"data": "abc", "format": "wav"},
            "_meta": {"path": "/audio.wav"},
        }
        messages = [{"role": "user", "content": [image_block, audio_block]}]
        stripped = LLMProvider._strip_media_content(messages)
        assert stripped is not None
        blocks = stripped[0]["content"]
        assert len(blocks) == 2
        assert "[image:" in blocks[0]["text"]
        assert "[audio:" in blocks[1]["text"]

    def test_strips_video_url(self):
        """A ``video_url`` block becomes ``[video: <path>]``; text blocks are kept."""
        video_block = {
            "type": "video_url",
            "video_url": {"url": "data:video/mp4;base64,abc"},
            "_meta": {"path": "/video.mp4"},
        }
        text_block = {"type": "text", "text": "desc"}
        messages = [{"role": "user", "content": [video_block, text_block]}]
        stripped = LLMProvider._strip_media_content(messages)
        assert stripped is not None
        blocks = stripped[0]["content"]
        assert blocks[0] == {"type": "text", "text": "[video: /video.mp4]"}
        assert blocks[1] == {"type": "text", "text": "desc"}

    def test_string_content_unchanged(self):
        """String content contains nothing to strip, so ``None`` is returned."""
        messages = [{"role": "user", "content": "plain text"}]
        stripped = LLMProvider._strip_media_content(messages)
        assert stripped is None
# ── _strip_image_content backward compat ──────────────────────────────
class TestStripImageContentCompat:
    """Check that the older ``LLMProvider._strip_image_content`` entry point still works."""

    def test_still_works(self):
        """An ``image_url`` block is still turned into an ``[image: <path>]`` text."""
        image_block = {
            "type": "image_url",
            "image_url": {"url": "data:image/png;base64,abc"},
            "_meta": {"path": "/img.png"},
        }
        messages = [{"role": "user", "content": [image_block]}]
        stripped = LLMProvider._strip_image_content(messages)
        assert stripped is not None
        first_block = stripped[0]["content"][0]
        assert "[image: /img.png]" in first_block["text"]
# ── _sanitize_persisted_blocks for image/audio/video blocks ───────────
class TestSanitizePersistedBlocks:
    """Exercise ``AgentLoop._sanitize_persisted_blocks`` on media content blocks."""

    @pytest.fixture
    def loop_mock(self):
        """Provide an ``AgentLoop`` instance created without running ``__init__``."""
        from nanobot.agent.loop import AgentLoop

        return object.__new__(AgentLoop)

    def test_audio_block_replaced_with_placeholder(self, loop_mock):
        """An ``input_audio`` block becomes ``[audio: <path>]``; text is kept."""
        audio_block = {
            "type": "input_audio",
            "input_audio": {"data": "abc", "format": "wav"},
            "_meta": {"path": "/audio.wav"},
        }
        text_block = {"type": "text", "text": "hello"}
        sanitized = loop_mock._sanitize_persisted_blocks([audio_block, text_block])
        assert len(sanitized) == 2
        assert sanitized[0] == {"type": "text", "text": "[audio: /audio.wav]"}
        assert sanitized[1] == {"type": "text", "text": "hello"}

    def test_image_block_replaced(self, loop_mock):
        """A data-URI ``image_url`` block becomes an ``[image: <path>]`` text."""
        image_block = {
            "type": "image_url",
            "image_url": {"url": "data:image/png;base64,abc"},
            "_meta": {"path": "/img.png"},
        }
        sanitized = loop_mock._sanitize_persisted_blocks([image_block])
        assert len(sanitized) == 1
        only = sanitized[0]
        assert "[image: /img.png]" in only["text"]

    def test_video_block_replaced_with_placeholder(self, loop_mock):
        """A ``video_url`` block becomes ``[video: <path>]``; text is kept."""
        video_block = {
            "type": "video_url",
            "video_url": {"url": "data:video/mp4;base64,abc"},
            "_meta": {"path": "/video.mp4"},
        }
        text_block = {"type": "text", "text": "hello"}
        sanitized = loop_mock._sanitize_persisted_blocks([video_block, text_block])
        assert len(sanitized) == 2
        assert sanitized[0] == {"type": "text", "text": "[video: /video.mp4]"}
        assert sanitized[1] == {"type": "text", "text": "hello"}

    def test_non_data_image_unchanged(self, loop_mock):
        """An ``image_url`` block with an https URL keeps its ``image_url`` type."""
        remote_image = {
            "type": "image_url",
            "image_url": {"url": "https://example.com/img.png"},
        }
        sanitized = loop_mock._sanitize_persisted_blocks([remote_image])
        assert len(sanitized) == 1
        assert sanitized[0]["type"] == "image_url"
# ── Anthropic provider input_audio handling ────────────────────────────
class TestAnthropicAudioConversion:
    """Check how ``AnthropicProvider._convert_user_content`` treats ``input_audio``."""

    def test_input_audio_converted_to_text(self):
        """The converted list contains a text block carrying an ``[audio:`` marker."""
        from nanobot.providers.anthropic_provider import AnthropicProvider

        # Skip __init__ so the provider is built without running its constructor.
        provider = AnthropicProvider.__new__(AnthropicProvider)
        audio_block = {
            "type": "input_audio",
            "input_audio": {"data": "abc", "format": "wav"},
            "_meta": {"path": "/test.wav"},
        }
        text_block = {"type": "text", "text": "listen"}
        converted = provider._convert_user_content([audio_block, text_block])
        assert isinstance(converted, list)
        texts = (b.get("text", "") for b in converted if b.get("type") == "text")
        assert any("[audio:" in text for text in texts)
# ── OpenAI Codex provider input_audio handling ─────────────────────────
class TestCodexAudioConversion:
    """Check how the Codex provider's ``_convert_user_message`` treats ``input_audio``."""

    def test_input_audio_passed_through(self):
        """The ``input_audio`` block survives conversion with its data intact."""
        from nanobot.providers.openai_codex_provider import _convert_user_message

        audio_block = {
            "type": "input_audio",
            "input_audio": {"data": "abc123", "format": "wav"},
        }
        text_block = {"type": "text", "text": "listen"}
        message = _convert_user_message([audio_block, text_block])
        assert message["role"] == "user"
        passed_through = [
            item for item in message["content"] if item.get("type") == "input_audio"
        ]
        assert len(passed_through) == 1
        payload = passed_through[0]["input_audio"]
        assert payload["data"] == "abc123"