From c937c07178b517067733bf05037aac98d031bb1c Mon Sep 17 00:00:00 2001
From: Xubin Ren <xubinrencs@gmail.com>
Date: Tue, 14 Apr 2026 13:15:04 +0000
Subject: [PATCH] fix: two bugs in document extraction pipeline

Bug 1: _drain_pending did not call extract_documents on follow-up
messages arriving mid-turn. Documents attached to queued messages were
silently dropped because _build_user_content only handles images.
Fix: call extract_documents before _build_user_content in _drain_pending.

Bug 2: extract_documents read the entire file into memory (up to 50 MB)
just to check 16 bytes of magic header for MIME detection.
Fix: read only the first 16 bytes via open()+read(16) instead of
Path.read_bytes().

Added regression tests for both bugs.

Made-with: Cursor
---
 nanobot/agent/loop.py           | 10 +++---
 nanobot/utils/document.py       |  5 +--
 tests/test_api_attachment.py    | 26 +++++++++++++++
 tests/test_context_documents.py | 57 ++++++++++++++++++++++++++++++++-
 4 files changed, 91 insertions(+), 7 deletions(-)

diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py
index 2f80cd94a..206941941 100644
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@@ -385,10 +385,12 @@ class AgentLoop:
                     pending_msg = pending_queue.get_nowait()
                 except asyncio.QueueEmpty:
                     break
-                user_content = self.context._build_user_content(
-                    pending_msg.content,
-                    pending_msg.media if pending_msg.media else None,
-                )
+                content = pending_msg.content
+                media = pending_msg.media if pending_msg.media else None
+                if media:
+                    content, media = extract_documents(content, media)
+                    media = media or None
+                user_content = self.context._build_user_content(content, media)
                 runtime_ctx = self.context._build_runtime_context(
                     pending_msg.channel,
                     pending_msg.chat_id,
diff --git a/nanobot/utils/document.py b/nanobot/utils/document.py
index e2aaeade0..a27b0e7ad 100644
--- a/nanobot/utils/document.py
+++ b/nanobot/utils/document.py
@@ -251,8 +251,9 @@ def extract_documents(
             )
             continue
 
-        raw = p.read_bytes()
-        mime = detect_image_mime(raw) or mimetypes.guess_type(path_str)[0]
+        with open(p, "rb") as f:
+            header = f.read(16)
+        mime = detect_image_mime(header) or mimetypes.guess_type(path_str)[0]
         if mime and mime.startswith("image/"):
             image_paths.append(path_str)
         else:
diff --git a/tests/test_api_attachment.py b/tests/test_api_attachment.py
index 6bd3676ce..92e09ef88 100644
--- a/tests/test_api_attachment.py
+++ b/tests/test_api_attachment.py
@@ -429,6 +429,32 @@ def test_extract_documents_skips_oversized_files(tmp_path) -> None:
     assert image_paths == []
 
 
+def test_extract_documents_does_not_read_full_file_for_mime(tmp_path) -> None:
+    """MIME detection should only read header bytes, not the entire file."""
+    from pathlib import Path as _Path
+
+    big_txt = tmp_path / "big.txt"
+    big_txt.write_bytes(b"hello world " * 100_000)  # ~1.2 MB
+
+    original_read_bytes = _Path.read_bytes
+    read_sizes: list[int] = []
+
+    def _tracking_read_bytes(self):
+        data = original_read_bytes(self)
+        read_sizes.append(len(data))
+        return data
+
+    import unittest.mock
+    with unittest.mock.patch.object(_Path, "read_bytes", _tracking_read_bytes):
+        extract_documents("test", [str(big_txt)])
+
+    # If the full file was read for MIME detection, read_sizes would
+    # contain a >1MB entry.  After the fix, only a small header is read.
+    assert all(size <= 4096 for size in read_sizes), (
+        f"extract_documents read full file for MIME detection: sizes={read_sizes}"
+    )
+
+
 # ---------------------------------------------------------------------------
 # DOCX upload test — API saves file, loop layer extracts text
 # ---------------------------------------------------------------------------
diff --git a/tests/test_context_documents.py b/tests/test_context_documents.py
index 28a4f6d20..7d9ac9083 100644
--- a/tests/test_context_documents.py
+++ b/tests/test_context_documents.py
@@ -1,7 +1,8 @@
 """Tests for context builder media handling.
 
 The ContextBuilder._build_user_content method should ONLY handle images.
-Document text extraction is the responsibility of the API layer.
+Document text extraction is the responsibility of the processing layer
+(AgentLoop._process_message and _drain_pending).
 """
 
 from __future__ import annotations
@@ -9,6 +10,7 @@ from __future__ import annotations
 from pathlib import Path
 
 from nanobot.agent.context import ContextBuilder
+from nanobot.utils.document import extract_documents
 
 
 def _make_builder(tmp_path: Path) -> ContextBuilder:
@@ -56,3 +58,56 @@ def test_build_user_content_mixed_image_and_non_image(tmp_path: Path) -> None:
     assert any(b["type"] == "image_url" for b in result)
     text_parts = [b.get("text", "") for b in result if b.get("type") == "text"]
     assert all("report text" not in t for t in text_parts)
+
+
+# ---------------------------------------------------------------------------
+# Bug detection: extract_documents must be called BEFORE _build_user_content
+# to prevent document media from being silently dropped.
+# This simulates the _drain_pending code path.
+# ---------------------------------------------------------------------------
+
+def test_drain_pending_path_preserves_document_text(tmp_path: Path) -> None:
+    """Simulates the _drain_pending path: a pending follow-up message
+    with a document attachment must have its text extracted before being
+    passed to _build_user_content.  Without extract_documents, the
+    document is silently dropped."""
+    from docx import Document
+
+    doc = Document()
+    doc.add_paragraph("Quarterly revenue is $5M")
+    docx_path = tmp_path / "report.docx"
+    doc.save(docx_path)
+
+    content = "summarize"
+    media = [str(docx_path)]
+
+    # Step 1: extract_documents separates docs from images
+    new_content, image_only = extract_documents(content, media)
+
+    # Step 2: _build_user_content handles only images (none left here)
+    builder = _make_builder(tmp_path)
+    result = builder._build_user_content(new_content, image_only if image_only else None)
+
+    # The document text should be present in the final content
+    assert "Quarterly revenue" in result
+    assert "summarize" in result
+
+
+def test_drain_pending_path_without_extract_loses_document(tmp_path: Path) -> None:
+    """Demonstrates the BUG: if _drain_pending calls _build_user_content
+    directly without extract_documents, document content is lost."""
+    from docx import Document
+
+    doc = Document()
+    doc.add_paragraph("Secret data in document")
+    docx_path = tmp_path / "report.docx"
+    doc.save(docx_path)
+
+    builder = _make_builder(tmp_path)
+
+    # Bug path: call _build_user_content directly with document media
+    result = builder._build_user_content("summarize", [str(docx_path)])
+
+    # The document text is LOST — _build_user_content ignores non-images
+    assert result == "summarize"  # only the original text, no doc content
+    assert "Secret data" not in result