mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-06-14 06:43:53 +00:00
fix: two bugs in document extraction pipeline
Bug 1: _drain_pending did not call extract_documents on follow-up messages arriving mid-turn. Documents attached to queued messages were silently dropped because _build_user_content only handles images. Fix: call extract_documents before _build_user_content in _drain_pending. Bug 2: extract_documents read the entire file into memory (up to 50 MB) just to check 16 bytes of magic header for MIME detection. Fix: read only the first 16 bytes via open()+read(16) instead of Path.read_bytes(). Added regression tests for both bugs. Made-with: Cursor
This commit is contained in:
parent
92d6fca323
commit
c937c07178
@ -385,10 +385,12 @@ class AgentLoop:
|
|||||||
pending_msg = pending_queue.get_nowait()
|
pending_msg = pending_queue.get_nowait()
|
||||||
except asyncio.QueueEmpty:
|
except asyncio.QueueEmpty:
|
||||||
break
|
break
|
||||||
user_content = self.context._build_user_content(
|
content = pending_msg.content
|
||||||
pending_msg.content,
|
media = pending_msg.media if pending_msg.media else None
|
||||||
pending_msg.media if pending_msg.media else None,
|
if media:
|
||||||
)
|
content, media = extract_documents(content, media)
|
||||||
|
media = media or None
|
||||||
|
user_content = self.context._build_user_content(content, media)
|
||||||
runtime_ctx = self.context._build_runtime_context(
|
runtime_ctx = self.context._build_runtime_context(
|
||||||
pending_msg.channel,
|
pending_msg.channel,
|
||||||
pending_msg.chat_id,
|
pending_msg.chat_id,
|
||||||
|
|||||||
@ -251,8 +251,9 @@ def extract_documents(
|
|||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
raw = p.read_bytes()
|
with open(p, "rb") as f:
|
||||||
mime = detect_image_mime(raw) or mimetypes.guess_type(path_str)[0]
|
header = f.read(16)
|
||||||
|
mime = detect_image_mime(header) or mimetypes.guess_type(path_str)[0]
|
||||||
if mime and mime.startswith("image/"):
|
if mime and mime.startswith("image/"):
|
||||||
image_paths.append(path_str)
|
image_paths.append(path_str)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -429,6 +429,32 @@ def test_extract_documents_skips_oversized_files(tmp_path) -> None:
|
|||||||
assert image_paths == []
|
assert image_paths == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_documents_does_not_read_full_file_for_mime(tmp_path) -> None:
|
||||||
|
"""MIME detection should only read header bytes, not the entire file."""
|
||||||
|
from pathlib import Path as _Path
|
||||||
|
|
||||||
|
big_txt = tmp_path / "big.txt"
|
||||||
|
big_txt.write_bytes(b"hello world " * 100_000) # ~1.2 MB
|
||||||
|
|
||||||
|
original_read_bytes = _Path.read_bytes
|
||||||
|
read_sizes: list[int] = []
|
||||||
|
|
||||||
|
def _tracking_read_bytes(self):
|
||||||
|
data = original_read_bytes(self)
|
||||||
|
read_sizes.append(len(data))
|
||||||
|
return data
|
||||||
|
|
||||||
|
import unittest.mock
|
||||||
|
with unittest.mock.patch.object(_Path, "read_bytes", _tracking_read_bytes):
|
||||||
|
extract_documents("test", [str(big_txt)])
|
||||||
|
|
||||||
|
# If the full file was read for MIME detection, read_sizes would
|
||||||
|
# contain a >1MB entry. After the fix, only a small header is read.
|
||||||
|
assert all(size <= 4096 for size in read_sizes), (
|
||||||
|
f"extract_documents read full file for MIME detection: sizes={read_sizes}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# DOCX upload test — API saves file, loop layer extracts text
|
# DOCX upload test — API saves file, loop layer extracts text
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
@ -1,7 +1,8 @@
|
|||||||
"""Tests for context builder media handling.
|
"""Tests for context builder media handling.
|
||||||
|
|
||||||
The ContextBuilder._build_user_content method should ONLY handle images.
|
The ContextBuilder._build_user_content method should ONLY handle images.
|
||||||
Document text extraction is the responsibility of the API layer.
|
Document text extraction is the responsibility of the processing layer
|
||||||
|
(AgentLoop._process_message and _drain_pending).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@ -9,6 +10,7 @@ from __future__ import annotations
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from nanobot.agent.context import ContextBuilder
|
from nanobot.agent.context import ContextBuilder
|
||||||
|
from nanobot.utils.document import extract_documents
|
||||||
|
|
||||||
|
|
||||||
def _make_builder(tmp_path: Path) -> ContextBuilder:
|
def _make_builder(tmp_path: Path) -> ContextBuilder:
|
||||||
@ -56,3 +58,56 @@ def test_build_user_content_mixed_image_and_non_image(tmp_path: Path) -> None:
|
|||||||
assert any(b["type"] == "image_url" for b in result)
|
assert any(b["type"] == "image_url" for b in result)
|
||||||
text_parts = [b.get("text", "") for b in result if b.get("type") == "text"]
|
text_parts = [b.get("text", "") for b in result if b.get("type") == "text"]
|
||||||
assert all("report text" not in t for t in text_parts)
|
assert all("report text" not in t for t in text_parts)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Bug detection: extract_documents must be called BEFORE _build_user_content
|
||||||
|
# to prevent document media from being silently dropped.
|
||||||
|
# This simulates the _drain_pending code path.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def test_drain_pending_path_preserves_document_text(tmp_path: Path) -> None:
|
||||||
|
"""Simulates the _drain_pending path: a pending follow-up message
|
||||||
|
with a document attachment must have its text extracted before being
|
||||||
|
passed to _build_user_content. Without extract_documents, the
|
||||||
|
document is silently dropped."""
|
||||||
|
from docx import Document
|
||||||
|
|
||||||
|
doc = Document()
|
||||||
|
doc.add_paragraph("Quarterly revenue is $5M")
|
||||||
|
docx_path = tmp_path / "report.docx"
|
||||||
|
doc.save(docx_path)
|
||||||
|
|
||||||
|
content = "summarize"
|
||||||
|
media = [str(docx_path)]
|
||||||
|
|
||||||
|
# Step 1: extract_documents separates docs from images
|
||||||
|
new_content, image_only = extract_documents(content, media)
|
||||||
|
|
||||||
|
# Step 2: _build_user_content handles only images (none left here)
|
||||||
|
builder = _make_builder(tmp_path)
|
||||||
|
result = builder._build_user_content(new_content, image_only if image_only else None)
|
||||||
|
|
||||||
|
# The document text should be present in the final content
|
||||||
|
assert "Quarterly revenue" in result
|
||||||
|
assert "summarize" in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_drain_pending_path_without_extract_loses_document(tmp_path: Path) -> None:
|
||||||
|
"""Demonstrates the BUG: if _drain_pending calls _build_user_content
|
||||||
|
directly without extract_documents, document content is lost."""
|
||||||
|
from docx import Document
|
||||||
|
|
||||||
|
doc = Document()
|
||||||
|
doc.add_paragraph("Secret data in document")
|
||||||
|
docx_path = tmp_path / "report.docx"
|
||||||
|
doc.save(docx_path)
|
||||||
|
|
||||||
|
builder = _make_builder(tmp_path)
|
||||||
|
|
||||||
|
# Bug path: call _build_user_content directly with document media
|
||||||
|
result = builder._build_user_content("summarize", [str(docx_path)])
|
||||||
|
|
||||||
|
# The document text is LOST — _build_user_content ignores non-images
|
||||||
|
assert result == "summarize" # only the original text, no doc content
|
||||||
|
assert "Secret data" not in result
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user