mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-22 17:42:24 +00:00
feat(read_file): add DOCX, XLSX, PPTX support via document.extract_text()
Wire up the existing office document extractors in document.py to ReadFileTool by adding an extension guard and _read_office_doc() method that follows the established PDF pattern. Handles missing libraries, corrupt files, empty documents, and 128K truncation consistently.
This commit is contained in:
parent
46864b0911
commit
53ba410e49
@ -137,10 +137,11 @@ class ReadFileTool(_FsTool):
|
|||||||
@property
|
@property
|
||||||
def description(self) -> str:
|
def description(self) -> str:
|
||||||
return (
|
return (
|
||||||
"Read a file (text or image). Text output format: LINE_NUM|CONTENT. "
|
"Read a file (text, image, or document). "
|
||||||
|
"Text output format: LINE_NUM|CONTENT. "
|
||||||
"Images return visual content for analysis. "
|
"Images return visual content for analysis. "
|
||||||
"Use offset and limit for large files. "
|
"Supports PDF, DOCX, XLSX, PPTX documents. "
|
||||||
"Cannot read non-image binary files. "
|
"Use offset and limit for large text files. "
|
||||||
"Reads exceeding ~128K chars are truncated."
|
"Reads exceeding ~128K chars are truncated."
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -169,6 +170,10 @@ class ReadFileTool(_FsTool):
|
|||||||
if fp.suffix.lower() == ".pdf":
|
if fp.suffix.lower() == ".pdf":
|
||||||
return self._read_pdf(fp, pages)
|
return self._read_pdf(fp, pages)
|
||||||
|
|
||||||
|
# Office document support
|
||||||
|
if fp.suffix.lower() in {".docx", ".xlsx", ".pptx"}:
|
||||||
|
return self._read_office_doc(fp)
|
||||||
|
|
||||||
raw = fp.read_bytes()
|
raw = fp.read_bytes()
|
||||||
if not raw:
|
if not raw:
|
||||||
return f"(Empty file: {path})"
|
return f"(Empty file: {path})"
|
||||||
@ -304,6 +309,25 @@ class ReadFileTool(_FsTool):
|
|||||||
result = result[:self._MAX_CHARS] + "\n\n(PDF text truncated at ~128K chars)"
|
result = result[:self._MAX_CHARS] + "\n\n(PDF text truncated at ~128K chars)"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def _read_office_doc(self, fp: Path) -> str:
    """Return the text of an office document (DOCX, XLSX or PPTX).

    Delegates extraction to ``nanobot.utils.document.extract_text`` and
    turns its result into the string handed back to the caller.

    Args:
        fp: Path of the office document to read.

    Returns:
        The extracted text, cut to ``self._MAX_CHARS`` characters and
        followed by a truncation notice when it is longer; otherwise a
        one-line message when the format is unsupported, extraction
        reported an error, or no text could be extracted.
    """
    # Imported inside the method so the name is looked up on every call;
    # patching ``nanobot.utils.document.extract_text`` therefore takes effect.
    from nanobot.utils.document import extract_text

    result = extract_text(fp)

    # Extension without its leading dot (".docx" -> "DOCX"), shared by all
    # user-facing messages so they label the format the same way.
    kind = fp.suffix.upper().lstrip(".")

    if result is None:
        return f"Error: Unsupported file format: {fp.suffix}"

    # NOTE(review): extract_text appears to report failures in-band as
    # "[error: ...]" strings rather than raising -- confirm against document.py.
    if result.startswith("[error:"):
        return f"Error reading {kind} file: {result}"

    # Whitespace-only output carries no content either, so treat it like
    # an empty extraction instead of returning blank text.
    if not result.strip():
        return f"({kind} has no extractable text: {fp})"

    if len(result) > self._MAX_CHARS:
        result = result[:self._MAX_CHARS] + "\n\n(Document text truncated at ~128K chars)"

    return result
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# write_file
|
# write_file
|
||||||
|
|||||||
@ -1,7 +1,8 @@
|
|||||||
"""Tests for ReadFileTool enhancements: description fix, read dedup, PDF support, device blacklist."""
|
"""Tests for ReadFileTool enhancements: description fix, read dedup, PDF support, device blacklist, office docs."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@ -246,3 +247,123 @@ class TestReadFileLineEndingNormalization:
|
|||||||
result = await tool.execute(path=str(f))
|
result = await tool.execute(path=str(f))
|
||||||
assert "\r" not in result
|
assert "\r" not in result
|
||||||
assert "alpha" in result and "beta" in result and "gamma" in result
|
assert "alpha" in result and "beta" in result and "gamma" in result
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Office document support (DOCX, XLSX, PPTX)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReadOfficeDocuments:
    """ReadFileTool routes .docx/.xlsx/.pptx paths through extract_text."""

    # Dotted path of the extractor that every test replaces with a stub.
    _EXTRACTOR = "nanobot.utils.document.extract_text"

    @pytest.fixture()
    def tool(self, tmp_path):
        """Provide a ReadFileTool whose workspace is the temp directory."""
        return ReadFileTool(workspace=tmp_path)

    @pytest.mark.asyncio
    async def test_docx_returns_extracted_text(self, tool, tmp_path):
        """A DOCX read returns the extractor's text without an error."""
        doc = tmp_path / "test.docx"
        doc.write_bytes(b"PK")
        with patch(self._EXTRACTOR, return_value="Title\n\nParagraph 1"):
            output = await tool.execute(path=str(doc))
        assert "Title" in output
        assert "Paragraph 1" in output
        assert "Error" not in output

    @pytest.mark.asyncio
    async def test_xlsx_returns_extracted_text(self, tool, tmp_path):
        """An XLSX read returns the sheet text produced by the extractor."""
        doc = tmp_path / "test.xlsx"
        doc.write_bytes(b"PK")
        with patch(self._EXTRACTOR, return_value="--- Sheet: Sheet1 ---\nName\tAge\nAlice\t30"):
            output = await tool.execute(path=str(doc))
        assert "Sheet1" in output
        assert "Alice" in output

    @pytest.mark.asyncio
    async def test_pptx_returns_extracted_text(self, tool, tmp_path):
        """A PPTX read returns the slide text produced by the extractor."""
        doc = tmp_path / "test.pptx"
        doc.write_bytes(b"PK")
        with patch(self._EXTRACTOR, return_value="--- Slide 1 ---\nWelcome\n--- Slide 2 ---\nContent"):
            output = await tool.execute(path=str(doc))
        assert "Welcome" in output
        assert "Content" in output

    @pytest.mark.asyncio
    async def test_docx_missing_library(self, tool, tmp_path):
        """A missing-library error string is surfaced as an Error message."""
        doc = tmp_path / "test.docx"
        doc.write_bytes(b"PK")
        with patch(self._EXTRACTOR, return_value="[error: python-docx not installed]"):
            output = await tool.execute(path=str(doc))
        assert "Error" in output
        assert "python-docx not installed" in output

    @pytest.mark.asyncio
    async def test_docx_corrupt_file(self, tool, tmp_path):
        """An extraction-failure error string is surfaced as an Error message."""
        doc = tmp_path / "test.docx"
        doc.write_bytes(b"not-a-zip")
        with patch(self._EXTRACTOR, return_value="[error: failed to extract DOCX: bad zip]"):
            output = await tool.execute(path=str(doc))
        assert "Error" in output
        assert "failed to extract DOCX" in output

    @pytest.mark.asyncio
    async def test_unsupported_extension(self, tool, tmp_path):
        """A None result from the extractor is reported as unsupported."""
        doc = tmp_path / "test.docx"
        doc.write_bytes(b"PK")
        with patch(self._EXTRACTOR, return_value=None):
            output = await tool.execute(path=str(doc))
        assert "Error" in output
        assert "Unsupported" in output

    @pytest.mark.asyncio
    async def test_empty_document_returns_descriptive_message(self, tool, tmp_path):
        """An empty extraction yields the 'no extractable text' message."""
        doc = tmp_path / "empty.docx"
        doc.write_bytes(b"PK")
        with patch(self._EXTRACTOR, return_value=""):
            output = await tool.execute(path=str(doc))
        assert "no extractable text" in output
|
||||||
|
|
||||||
|
|
||||||
|
class TestOfficeDocTruncation:
    """Office-document output is capped at ReadFileTool._MAX_CHARS."""

    # Dotted path of the extractor that every test replaces with a stub.
    _EXTRACTOR = "nanobot.utils.document.extract_text"

    @pytest.fixture()
    def tool(self, tmp_path):
        """Provide a ReadFileTool whose workspace is the temp directory."""
        return ReadFileTool(workspace=tmp_path)

    @pytest.mark.asyncio
    async def test_large_document_truncated(self, tool, tmp_path):
        """Text longer than the cap is cut and carries a truncation notice."""
        doc = tmp_path / "large.docx"
        doc.write_bytes(b"PK")
        with patch(self._EXTRACTOR, return_value="x" * 200_000):
            output = await tool.execute(path=str(doc))
        # The +100 leaves room for the appended truncation notice.
        assert len(output) <= ReadFileTool._MAX_CHARS + 100
        assert "truncated at ~128K chars" in output

    @pytest.mark.asyncio
    async def test_small_document_not_truncated(self, tool, tmp_path):
        """Short text is returned whole, with no truncation notice."""
        doc = tmp_path / "small.docx"
        doc.write_bytes(b"PK")
        with patch(self._EXTRACTOR, return_value="Hello world"):
            output = await tool.execute(path=str(doc))
        assert "truncated" not in output
        assert "Hello world" in output

    @pytest.mark.asyncio
    async def test_error_response_not_truncated(self, tool, tmp_path):
        """An error result is reported as an Error and never marked truncated."""
        doc = tmp_path / "bad.docx"
        doc.write_bytes(b"PK")
        with patch(self._EXTRACTOR, return_value="[error: failed to extract DOCX: something went wrong]"):
            output = await tool.execute(path=str(doc))
        assert "Error" in output
        assert "truncated" not in output
|
||||||
|
|
||||||
|
|
||||||
|
class TestReadDescriptionUpdate:
    """The tool description advertises office-document support."""

    def test_description_mentions_documents(self):
        """The description names documents or at least one office format."""
        text = ReadFileTool().description.lower()
        keywords = ("document", "docx", "xlsx", "pptx")
        assert any(word in text for word in keywords)

    def test_description_no_longer_says_cannot_read(self):
        """The old 'Cannot read non-image binary files' wording is gone."""
        text = ReadFileTool().description.lower()
        assert "cannot read" not in text
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user