Add document extraction channel toggle

2026-06-15 07:14:08 +00:00 · 2026-05-29 11:29:20 +08:00 · 2026-05-29 11:29:20 +08:00 · ec4f9e9857
commit ec4f9e9857
parent 404b68cdd4
5 changed files with 225 additions and 3 deletions
--- a/docs/configuration.md
+++ b/docs/configuration.md
@ -1043,6 +1043,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
  "channels": {
    "sendProgress": true,
    "sendToolHints": false,
+    "extractDocumentText": true,
    "sendMaxRetries": 3,
    "transcriptionProvider": "groq",
    "transcriptionLanguage": null,
@ -1056,6 +1057,7 @@ Global settings that apply to all channels. Configure under the `channels` secti
 | `sendProgress` | `true` | Stream agent's text progress to the channel |
 | `sendToolHints` | `false` | Stream tool-call hints (e.g. `read_file("…")`) |
 | `showReasoning` | `true` | Allow channels to surface model reasoning/thinking content (DeepSeek-R1 `reasoning_content`, Anthropic `thinking_blocks`, inline `<think>` tags). Reasoning flows as a dedicated stream with `_reasoning_delta` / `_reasoning_end` markers — channels override `send_reasoning_delta` / `send_reasoning_end` to render in-place updates. Even with `true`, channels without those overrides stay no-op silently. Currently surfaced on CLI and WebSocket/WebUI (italic shimmer header, auto-collapses after the stream ends); Telegram / Slack / Discord / Feishu / WeChat / Matrix keep the base no-op until their bubble UI is adapted. Independent of `sendProgress`. |
+| `extractDocumentText` | `true` | Extract supported document/text attachments into the model prompt. Set to `false` to keep document content out of the prompt and include attachment path references instead. |
 | `sendMaxRetries` | `3` | Max delivery attempts per outbound message, including the initial send (0-10 configured, minimum 1 actual attempt) |
 | `transcriptionProvider` | `"groq"` | Voice transcription backend: `"groq"` (free tier, default) or `"openai"`. API key and optional `apiBase` are auto-resolved from the matching provider config. Chat-style bases such as `https://api.groq.com/openai/v1` are normalized to the audio transcription endpoint. |
 | `transcriptionLanguage` | `null` | Optional ISO-639-1 language hint for audio transcription, e.g. `"en"`, `"ko"`, `"ja"`. |
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@ -4,6 +4,7 @@ from __future__ import annotations

 import asyncio
 import dataclasses
+import mimetypes
 import os
 import time
 from contextlib import AsyncExitStack, nullcontext, suppress
@ -51,7 +52,7 @@ from nanobot.session.webui_turns import (
    mark_webui_session,
 )
 from nanobot.utils.document import extract_documents
-from nanobot.utils.helpers import image_placeholder_text
+from nanobot.utils.helpers import detect_image_mime, image_placeholder_text
 from nanobot.utils.helpers import truncate_text as truncate_text_fn
 from nanobot.utils.image_generation_intent import image_generation_prompt
 from nanobot.utils.llm_runtime import LLMRuntime
@ -711,7 +712,7 @@ class AgentLoop:
                content = pending_msg.content
                media = pending_msg.media if pending_msg.media else None
                if media:
-                    content, media = extract_documents(content, media)
+                    content, media = self._prepare_message_media(content, media)
                    media = media or None
                user_content = self.context._build_user_content(content, media)
                return {"role": "user", "content": user_content}
@ -1271,7 +1272,7 @@ class AgentLoop:
        msg = ctx.msg

        if msg.media:
-            new_content, image_only = extract_documents(msg.content, msg.media)
+            new_content, image_only = self._prepare_message_media(msg.content, msg.media)
            ctx.msg = dataclasses.replace(msg, content=new_content, media=image_only)
            msg = ctx.msg

@ -1292,6 +1293,49 @@ class AgentLoop:

        return "ok"

+    def _prepare_message_media(self, content: str, media: list[str]) -> tuple[str, list[str]]:
+        """Prepare a message's text and attachments for the model prompt.
+
+        When document text extraction is enabled, delegates to
+        ``extract_documents``; otherwise non-image attachments are replaced by
+        ``[Attachment: <path>]`` references appended to the text.
+
+        Args:
+            content: The message text.
+            media: Attachment paths that accompany the message.
+
+        Returns:
+            The ``(content, media)`` pair produced by whichever helper ran.
+        """
+        if self._should_extract_document_text():
+            return extract_documents(content, media)
+        return self._reference_non_image_attachments(content, media)
+
+    def _should_extract_document_text(self) -> bool:
+        """Report whether document attachments should be extracted into the prompt.
+
+        Reads ``self.channels_config``. A missing config (``None``) means
+        extraction stays enabled. Dict configs are checked for the snake_case
+        key first, then the camelCase key; any other object is read via its
+        ``extract_document_text`` attribute. The default is ``True`` in every
+        case.
+
+        Returns:
+            ``False`` only when the configured value is the literal ``False``;
+            ``True`` otherwise.
+        """
+        cfg = self.channels_config
+        if cfg is None:
+            return True
+        if isinstance(cfg, dict):
+            # The snake_case key wins when both spellings are present.
+            value = cfg.get("extract_document_text", cfg.get("extractDocumentText", True))
+        else:
+            value = getattr(cfg, "extract_document_text", True)
+        # Identity check: values such as None or 0 do not disable extraction.
+        return value is not False
+
+    @staticmethod
+    def _reference_non_image_attachments(content: str, media: list[str]) -> tuple[str, list[str]]:
+        """Replace non-image attachments with path references in the text.
+
+        Each path in ``media`` is classified with ``_looks_like_image``. Image
+        paths are kept, in order, in the returned media list. Every other path
+        becomes an ``[Attachment: <path>]`` line; the file itself is not read
+        here.
+
+        Args:
+            content: The message text; may be empty.
+            media: Attachment paths to classify.
+
+        Returns:
+            ``(content, image_paths)`` where ``content`` has the reference lines
+            appended after a blank line (or consists solely of them when the
+            original text was empty).
+        """
+        image_paths: list[str] = []
+        attachment_refs: list[str] = []
+        for path in media:
+            if AgentLoop._looks_like_image(path):
+                image_paths.append(path)
+            else:
+                attachment_refs.append(f"[Attachment: {path}]")
+        if attachment_refs:
+            suffix = "\n".join(attachment_refs)
+            # Avoid leading blank lines when there is no original text.
+            content = f"{content}\n\n{suffix}" if content else suffix
+        return content, image_paths
+
+    @staticmethod
+    def _looks_like_image(path: str) -> bool:
+        """Return whether ``path`` appears to refer to an image.
+
+        If the path is an existing file, its first 16 bytes are passed to
+        ``detect_image_mime`` (presumably a magic-byte sniffer; confirm against
+        ``nanobot.utils.helpers``). When that yields nothing, or the file
+        cannot be read, the MIME type is guessed from the path via
+        ``mimetypes.guess_type``.
+
+        Returns:
+            ``True`` when the resolved MIME type starts with ``image/``.
+        """
+        p = Path(path)
+        mime: str | None = None
+        if p.is_file():
+            try:
+                with p.open("rb") as f:
+                    mime = detect_image_mime(f.read(16))
+            except OSError:
+                # Unreadable file: fall through to the name-based guess.
+                mime = None
+        if not mime:
+            mime = mimetypes.guess_type(path)[0]
+        return bool(mime and mime.startswith("image/"))
+
    async def _state_compact(self, ctx: TurnContext) -> str:
        ctx.session, pending = self.auto_compact.prepare_session(ctx.session, ctx.session_key)
        ctx.pending_summary = pending
--- a/nanobot/config/schema.py
+++ b/nanobot/config/schema.py
@ -37,6 +37,7 @@ class ChannelsConfig(Base):
    send_progress: bool = True  # stream agent's text progress to the channel
    send_tool_hints: bool = False  # stream tool-call hints (e.g. read_file("…"))
    show_reasoning: bool = True  # surface model reasoning when channel implements it
+    extract_document_text: bool = True  # extract text from document attachments before sending to the model
    send_max_retries: int = Field(default=3, ge=0, le=10)  # Max delivery attempts (initial send included)
    transcription_provider: str = "groq"  # Voice transcription backend: "groq" or "openai"
    transcription_language: str | None = Field(default=None, pattern=r"^[a-z]{2,3}$")  # Optional ISO-639-1 hint for audio transcription
--- a/tests/agent/test_document_extraction_toggle.py
+++ b/tests/agent/test_document_extraction_toggle.py
@ -0,0 +1,168 @@
+import asyncio
+import base64
+from pathlib import Path
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from nanobot.agent.loop import AgentLoop, TurnContext, TurnState
+from nanobot.bus.events import InboundMessage
+from nanobot.bus.queue import MessageBus
+from nanobot.config.schema import ChannelsConfig
+from nanobot.providers.base import LLMResponse
+
+
+def _make_loop(tmp_path: Path, channels_config: ChannelsConfig | None = None) -> AgentLoop:
+    """Build an ``AgentLoop`` rooted at ``tmp_path`` with a mocked provider.
+
+    The provider reports ``"test-model"`` as its default model and its
+    ``chat_with_retry`` resolves to ``LLMResponse(content="ok")``.
+    ``channels_config`` is passed straight through to the loop.
+    """
+    provider = MagicMock()
+    provider.get_default_model.return_value = "test-model"
+    provider.chat_with_retry = AsyncMock(return_value=LLMResponse(content="ok"))
+    return AgentLoop(
+        bus=MessageBus(),
+        provider=provider,
+        workspace=tmp_path,
+        model="test-model",
+        channels_config=channels_config,
+    )
+
+
+@pytest.mark.asyncio
+async def test_state_restore_extracts_documents_by_default(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Without a channels config, ``_state_restore`` calls ``extract_documents``.
+
+    The extractor is replaced by a fake that records its arguments, so the
+    test checks the call and that its result lands on ``ctx.msg``.
+    """
+    loop = _make_loop(tmp_path)
+    doc_path = tmp_path / "report.txt"
+    doc_path.write_text("Quarterly revenue is $5M", encoding="utf-8")
+    calls: list[tuple[str, list[str]]] = []
+
+    def fake_extract_documents(content: str, media: list[str]) -> tuple[str, list[str]]:
+        calls.append((content, media))
+        return f"{content}\n\n[File: report.txt]\nQuarterly revenue is $5M", []
+
+    # Patch the name as imported into the loop module, which is what the loop calls.
+    monkeypatch.setattr("nanobot.agent.loop.extract_documents", fake_extract_documents)
+
+    ctx = TurnContext(
+        msg=InboundMessage(
+            channel="cli",
+            sender_id="u",
+            chat_id="c",
+            content="summarize",
+            media=[str(doc_path)],
+        ),
+        session_key="cli:c",
+        state=TurnState.RESTORE,
+        turn_id="turn-1",
+    )
+
+    assert await loop._state_restore(ctx) == "ok"
+
+    assert calls == [("summarize", [str(doc_path)])]
+    assert "Quarterly revenue" in ctx.msg.content
+    assert ctx.msg.media == []
+
+
+@pytest.mark.asyncio
+async def test_state_restore_references_documents_when_extraction_disabled(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """With extraction disabled, ``_state_restore`` references the file by path.
+
+    The extractor is replaced by a stub that raises, so any call to it fails
+    the test; the message must carry an ``[Attachment: ...]`` reference and
+    none of the file's text.
+    """
+    loop = _make_loop(tmp_path, ChannelsConfig(extract_document_text=False))
+    doc_path = tmp_path / "report.txt"
+    doc_path.write_text("Quarterly revenue is $5M", encoding="utf-8")
+
+    def fail_extract_documents(content: str, media: list[str]) -> tuple[str, list[str]]:
+        raise AssertionError("document extraction should be disabled")
+
+    monkeypatch.setattr("nanobot.agent.loop.extract_documents", fail_extract_documents)
+
+    ctx = TurnContext(
+        msg=InboundMessage(
+            channel="cli",
+            sender_id="u",
+            chat_id="c",
+            content="summarize",
+            media=[str(doc_path)],
+        ),
+        session_key="cli:c",
+        state=TurnState.RESTORE,
+        turn_id="turn-1",
+    )
+
+    assert await loop._state_restore(ctx) == "ok"
+
+    assert "Quarterly revenue" not in ctx.msg.content
+    assert f"[Attachment: {doc_path}]" in ctx.msg.content
+    assert ctx.msg.media == []
+
+
+@pytest.mark.asyncio
+async def test_pending_followup_references_documents_when_extraction_disabled(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """A queued follow-up with a document is injected as a path reference.
+
+    Runs ``_run_agent_loop`` with one pending message that carries a text
+    attachment and extraction disabled, then inspects the messages sent on
+    the last provider call.
+    """
+    doc_path = tmp_path / "followup.txt"
+    doc_path.write_text("Do not inject this file body", encoding="utf-8")
+    captured_messages: list[list[dict]] = []
+    call_count = {"n": 0}
+
+    async def chat_with_retry(*, messages: list[dict], **kwargs: object) -> LLMResponse:
+        call_count["n"] += 1
+        # Copy each message dict so the captured snapshot does not alias the caller's dicts.
+        captured_messages.append([dict(message) for message in messages])
+        return LLMResponse(content=f"answer-{call_count['n']}", tool_calls=[], usage={})
+
+    loop = _make_loop(tmp_path, ChannelsConfig(extract_document_text=False))
+    loop.provider.chat_with_retry = chat_with_retry
+    loop.tools.get_definitions = MagicMock(return_value=[])
+
+    def fail_extract_documents(content: str, media: list[str]) -> tuple[str, list[str]]:
+        raise AssertionError("document extraction should be disabled")
+
+    monkeypatch.setattr("nanobot.agent.loop.extract_documents", fail_extract_documents)
+
+    pending_queue: asyncio.Queue[InboundMessage] = asyncio.Queue()
+    await pending_queue.put(
+        InboundMessage(
+            channel="cli",
+            sender_id="u",
+            chat_id="c",
+            content="check this",
+            media=[str(doc_path)],
+        )
+    )
+
+    final_content, _, _, _, had_injections = await loop._run_agent_loop(
+        [{"role": "user", "content": "hello"}],
+        channel="cli",
+        chat_id="c",
+        pending_queue=pending_queue,
+    )
+
+    assert final_content == "answer-2"
+    assert had_injections is True
+    # Last plain-string user message seen on the final provider call.
+    injected_user_content = [
+        message["content"]
+        for message in captured_messages[-1]
+        if message.get("role") == "user" and isinstance(message.get("content"), str)
+    ][-1]
+    assert "check this" in injected_user_content
+    assert f"[Attachment: {doc_path}]" in injected_user_content
+    assert "Do not inject this file body" not in injected_user_content
+
+
+def test_document_extraction_disabled_still_preserves_images(tmp_path: Path) -> None:
+    """Image paths stay in the media list; other files become references."""
+    image_path = tmp_path / "chart.png"
+    # Base64-encoded bytes written under a .png name to act as the image attachment.
+    image_path.write_bytes(
+        base64.b64decode(
+            "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+yF9kAAAAASUVORK5CYII="
+        )
+    )
+    doc_path = tmp_path / "report.txt"
+    doc_path.write_text("manual extraction target", encoding="utf-8")
+
+    content, media = AgentLoop._reference_non_image_attachments(
+        "review these",
+        [str(image_path), str(doc_path)],
+    )
+
+    assert media == [str(image_path)]
+    assert f"[Attachment: {doc_path}]" in content
--- a/tests/channels/test_channel_plugins.py
+++ b/tests/channels/test_channel_plugins.py
@ -91,6 +91,13 @@ def test_channels_config_builtin_fields_removed():
    assert not hasattr(cfg, "telegram")
    assert cfg.send_progress is True
    assert cfg.send_tool_hints is False
+    assert cfg.extract_document_text is True
+
+
+def test_channels_config_extract_document_text_accepts_camel_alias():
+    """Validating ``{"extractDocumentText": False}`` sets ``extract_document_text``."""
+    cfg = ChannelsConfig.model_validate({"extractDocumentText": False})
+
+    assert cfg.extract_document_text is False


 # ---------------------------------------------------------------------------