nanobot/tests/test_context_multimodal.py
chengyongru b9346b0d59 feat: generalize multimodal support with audio/video handling
Add comprehensive audio and video support across the agent pipeline:

- Generalize media placeholder system: _strip_image_content → _strip_media_content,
  _media_placeholder with type-specific labels, unified across providers
- Add detect_audio_mime with magic-byte detection and filename fallback
- Add _AUDIO_FORMAT_MAP for correct MIME-to-API-format conversion
- Add InputLimitsConfig with count limits (max_input_audios/videos) and byte limits
- Support input_audio blocks in context builder with OpenAI-compatible format
- Support video_url blocks with base64 inline data
- Add audio/video passthrough in Codex provider, placeholder fallback in Anthropic provider
- Thread supports_vision/audio/video capability flags through AgentLoop
- Unify placeholder format: [audio: path]/[video: path] instead of generic [file: path]
- Optimize file I/O: single read_bytes() instead of header+full double reads
- Extract _STRIP_MEDIA_TYPES as class constant to avoid per-call allocation
2026-04-08 23:14:40 +08:00

105 lines
3.8 KiB
Python

from pathlib import Path
from nanobot.agent.context import ContextBuilder
from nanobot.config.schema import InputLimitsConfig
# Tiny PNG-style fixture used as the on-disk payload for every "image" in these tests.
# NOTE(review): the IDAT length field says 0x0c (12) but only 11 data bytes follow;
# presumably harmless because the tests only need the file to be recognised as a
# PNG -- confirm before reusing this blob with a real decoder.
PNG_BYTES = b"".join(
    (
        b"\x89PNG\r\n\x1a\n",  # 8-byte PNG signature
        b"\x00\x00\x00\rIHDR",  # IHDR chunk: length 13 + chunk type
        b"\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00",  # 1x1, 8-bit, colour type 2
        b"\x90wS\xde",  # 4 bytes in the IHDR CRC slot
        b"\x00\x00\x00\x0cIDATx\x9cc``\x00\x00\x00\x04\x00\x01",  # IDAT length + type + data
        b"\x0b\x0e-\xb4",  # 4 bytes in the IDAT CRC slot
        b"\x00\x00\x00\x00IEND\xaeB`\x82",  # zero-length IEND chunk and its trailing 4 bytes
    )
)
def _builder(tmp_path: Path, input_limits: InputLimitsConfig | None = None) -> ContextBuilder:
    """Construct the ``ContextBuilder`` under test.

    Args:
        tmp_path: Passed as the builder's first positional argument.
        input_limits: Optional limits forwarded via the ``input_limits`` keyword;
            ``None`` is forwarded unchanged.

    Returns:
        The newly constructed ``ContextBuilder``.
    """
    builder_kwargs = {"input_limits": input_limits}
    return ContextBuilder(tmp_path, **builder_kwargs)
def test_build_user_content_keeps_only_first_three_images(tmp_path: Path) -> None:
    """Images beyond ``max_input_images`` are dropped and reported in a skip note.

    Writes one more PNG than the builder's configured image limit, then checks
    that exactly ``max_input_images`` ``image_url`` blocks are produced and that
    the final block's text mentions the single skipped image.
    """
    builder = _builder(tmp_path)
    max_images = builder.input_limits.max_input_images

    # Supply exactly one image more than the limit allows.
    paths = []
    for i in range(max_images + 1):
        path = tmp_path / f"img{i}.png"
        path.write_bytes(PNG_BYTES)
        paths.append(str(path))

    content = builder._build_user_content("describe these", paths)

    assert isinstance(content, list)
    assert sum(1 for block in content if block.get("type") == "image_url") == max_images

    # The expected note is derived from the configured limit instead of a
    # hard-coded "3", so the test keeps passing if the default limit changes.
    text_block = content[-1]["text"]
    assert f"[Skipped 1 image: only the first {max_images} images are included]" in text_block
def test_build_user_content_skips_invalid_images_with_note(tmp_path: Path) -> None:
    """A ``.txt`` attachment yields list content containing a ``[file:`` placeholder."""
    builder = _builder(tmp_path)

    # A .txt file is not treated as an image, so it takes the non-image path.
    text_file = tmp_path / "not-image.txt"
    text_file.write_text("hello", encoding="utf-8")

    result = builder._build_user_content("what is this?", [str(text_file)])

    assert isinstance(result, list)
    # At least one block's text must carry the generic file placeholder.
    has_placeholder = next(
        (True for block in result if "[file:" in block.get("text", "")),
        False,
    )
    assert has_placeholder
def test_build_user_content_skips_missing_file(tmp_path: Path) -> None:
    """A nonexistent image path gives plain-string content with an unreadable-file note."""
    builder = _builder(tmp_path)
    missing_image = tmp_path / "ghost.png"  # never created on disk

    result = builder._build_user_content("hello", [str(missing_image)])

    # With no usable media the content is a plain string, not a list of blocks.
    assert isinstance(result, str)
    expected_note = "[Skipped image: unable to read (ghost.png)]"
    assert expected_note in result
    # The user's own text must still be the tail of the message.
    assert result.endswith("hello")
def test_build_user_content_skips_large_images_with_note(tmp_path: Path) -> None:
    """An image larger than ``max_input_image_bytes`` is skipped with a size note."""
    builder = _builder(tmp_path)
    byte_limit = builder.input_limits.max_input_image_bytes

    # Padding the PNG with ``byte_limit`` extra bytes pushes it over the limit.
    oversized = tmp_path / "big.png"
    oversized.write_bytes(PNG_BYTES + b"x" * byte_limit)

    result = builder._build_user_content("analyze", [str(oversized)])

    # The note reports the limit in whole mebibytes.
    limit_mb = byte_limit // (1024 * 1024)
    expected_note = f"[Skipped image: file too large (big.png, limit {limit_mb} MB)]"
    assert isinstance(result, str)
    assert expected_note in result
def test_build_user_content_respects_custom_input_limits(tmp_path: Path) -> None:
    """Custom ``InputLimitsConfig`` values replace the defaults when building content."""
    custom_limits = InputLimitsConfig(max_input_images=1, max_input_image_bytes=1024)
    builder = _builder(tmp_path, input_limits=custom_limits)

    # Two images are offered while the custom limit allows only one; the second
    # is additionally padded by 1024 bytes.
    small_image = tmp_path / "small.png"
    large_image = tmp_path / "large.png"
    small_image.write_bytes(PNG_BYTES)
    large_image.write_bytes(PNG_BYTES + b"x" * 1024)
    attachments = [str(small_image), str(large_image)]

    result = builder._build_user_content("describe", attachments)

    assert isinstance(result, list)
    image_blocks = [block for block in result if block.get("type") == "image_url"]
    assert len(image_blocks) == 1
    trailing_text = result[-1]["text"]
    assert "[Skipped 1 image: only the first 1 images are included]" in trailing_text
def test_build_user_content_keeps_valid_images_and_skip_notes_together(tmp_path: Path) -> None:
    """A valid PNG and a ``.txt`` file together yield an image block plus one placeholder."""
    builder = _builder(tmp_path)

    png_file = tmp_path / "good.png"
    txt_file = tmp_path / "bad.txt"
    png_file.write_bytes(PNG_BYTES)
    txt_file.write_text("oops", encoding="utf-8")

    result = builder._build_user_content("check both", [str(png_file), str(txt_file)])

    assert isinstance(result, list)
    # The valid image comes first in the block list.
    first_block = result[0]
    assert first_block["type"] == "image_url"

    # The .txt file is non-image, so it shows up as exactly one text block
    # carrying a "[file:" placeholder.
    placeholder_blocks = []
    for block in result:
        if block.get("type") == "text" and "[file:" in block.get("text", ""):
            placeholder_blocks.append(block)
    assert len(placeholder_blocks) == 1