From c937c07178b517067733bf05037aac98d031bb1c Mon Sep 17 00:00:00 2001 From: Xubin Ren Date: Tue, 14 Apr 2026 13:15:04 +0000 Subject: [PATCH] fix: two bugs in document extraction pipeline Bug 1: _drain_pending did not call extract_documents on follow-up messages arriving mid-turn. Documents attached to queued messages were silently dropped because _build_user_content only handles images. Fix: call extract_documents before _build_user_content in _drain_pending. Bug 2: extract_documents read the entire file into memory (up to 50 MB) just to check 16 bytes of magic header for MIME detection. Fix: read only the first 16 bytes via open()+read(16) instead of Path.read_bytes(). Added regression tests for both bugs. Made-with: Cursor --- nanobot/agent/loop.py | 10 +++--- nanobot/utils/document.py | 5 +-- tests/test_api_attachment.py | 26 +++++++++++++++ tests/test_context_documents.py | 57 ++++++++++++++++++++++++++++++++- 4 files changed, 91 insertions(+), 7 deletions(-) diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py index 2f80cd94a..206941941 100644 --- a/nanobot/agent/loop.py +++ b/nanobot/agent/loop.py @@ -385,10 +385,12 @@ class AgentLoop: pending_msg = pending_queue.get_nowait() except asyncio.QueueEmpty: break - user_content = self.context._build_user_content( - pending_msg.content, - pending_msg.media if pending_msg.media else None, - ) + content = pending_msg.content + media = pending_msg.media if pending_msg.media else None + if media: + content, media = extract_documents(content, media) + media = media or None + user_content = self.context._build_user_content(content, media) runtime_ctx = self.context._build_runtime_context( pending_msg.channel, pending_msg.chat_id, diff --git a/nanobot/utils/document.py b/nanobot/utils/document.py index e2aaeade0..a27b0e7ad 100644 --- a/nanobot/utils/document.py +++ b/nanobot/utils/document.py @@ -251,8 +251,9 @@ def extract_documents( ) continue - raw = p.read_bytes() - mime = detect_image_mime(raw) or mimetypes.guess_type(path_str)[0] + with open(p, "rb") as f: + header = f.read(16) + mime = detect_image_mime(header) or mimetypes.guess_type(path_str)[0] if mime and mime.startswith("image/"): image_paths.append(path_str) else: diff --git a/tests/test_api_attachment.py b/tests/test_api_attachment.py index 6bd3676ce..92e09ef88 100644 --- a/tests/test_api_attachment.py +++ b/tests/test_api_attachment.py @@ -429,6 +429,32 @@ def test_extract_documents_skips_oversized_files(tmp_path) -> None: assert image_paths == [] +def test_extract_documents_does_not_read_full_file_for_mime(tmp_path) -> None: + """MIME detection should only read header bytes, not the entire file.""" + from pathlib import Path as _Path + + big_txt = tmp_path / "big.txt" + big_txt.write_bytes(b"hello world " * 100_000) # ~1.2 MB + + original_read_bytes = _Path.read_bytes + read_sizes: list[int] = [] + + def _tracking_read_bytes(self): + data = original_read_bytes(self) + read_sizes.append(len(data)) + return data + + import unittest.mock + with unittest.mock.patch.object(_Path, "read_bytes", _tracking_read_bytes): + extract_documents("test", [str(big_txt)]) + + # If the full file was read for MIME detection, read_sizes would + # contain a >1MB entry. After the fix, only a small header is read. + assert all(size <= 4096 for size in read_sizes), ( + f"extract_documents read full file for MIME detection: sizes={read_sizes}" + ) + + # --------------------------------------------------------------------------- # DOCX upload test — API saves file, loop layer extracts text # --------------------------------------------------------------------------- diff --git a/tests/test_context_documents.py b/tests/test_context_documents.py index 28a4f6d20..7d9ac9083 100644 --- a/tests/test_context_documents.py +++ b/tests/test_context_documents.py @@ -1,7 +1,8 @@ """Tests for context builder media handling. The ContextBuilder._build_user_content method should ONLY handle images. -Document text extraction is the responsibility of the API layer. +Document text extraction is the responsibility of the processing layer +(AgentLoop._process_message and _drain_pending). """ from __future__ import annotations @@ -9,6 +10,7 @@ from __future__ import annotations from pathlib import Path from nanobot.agent.context import ContextBuilder +from nanobot.utils.document import extract_documents def _make_builder(tmp_path: Path) -> ContextBuilder: @@ -56,3 +58,56 @@ def test_build_user_content_mixed_image_and_non_image(tmp_path: Path) -> None: assert any(b["type"] == "image_url" for b in result) text_parts = [b.get("text", "") for b in result if b.get("type") == "text"] assert all("report text" not in t for t in text_parts) + + +# --------------------------------------------------------------------------- +# Bug detection: extract_documents must be called BEFORE _build_user_content +# to prevent document media from being silently dropped. +# This simulates the _drain_pending code path. +# --------------------------------------------------------------------------- + +def test_drain_pending_path_preserves_document_text(tmp_path: Path) -> None: + """Simulates the _drain_pending path: a pending follow-up message + with a document attachment must have its text extracted before being + passed to _build_user_content. Without extract_documents, the + document is silently dropped.""" + from docx import Document + + doc = Document() + doc.add_paragraph("Quarterly revenue is $5M") + docx_path = tmp_path / "report.docx" + doc.save(docx_path) + + content = "summarize" + media = [str(docx_path)] + + # Step 1: extract_documents separates docs from images + new_content, image_only = extract_documents(content, media) + + # Step 2: _build_user_content handles only images (none left here) + builder = _make_builder(tmp_path) + result = builder._build_user_content(new_content, image_only if image_only else None) + + # The document text should be present in the final content + assert "Quarterly revenue" in result + assert "summarize" in result + + +def test_drain_pending_path_without_extract_loses_document(tmp_path: Path) -> None: + """Demonstrates the BUG: if _drain_pending calls _build_user_content + directly without extract_documents, document content is lost.""" + from docx import Document + + doc = Document() + doc.add_paragraph("Secret data in document") + docx_path = tmp_path / "report.docx" + doc.save(docx_path) + + builder = _make_builder(tmp_path) + + # Bug path: call _build_user_content directly with document media + result = builder._build_user_content("summarize", [str(docx_path)]) + + # The document text is LOST — _build_user_content ignores non-images + assert result == "summarize" # only the original text, no doc content + assert "Secret data" not in result