From 53ba410e4994b5092c9ea1941ac1baf5846d181a Mon Sep 17 00:00:00 2001 From: aiguozhi123456 <126325311+aiguozhi123456@users.noreply.github.com> Date: Mon, 20 Apr 2026 23:57:47 +0800 Subject: [PATCH] feat(read_file): add DOCX, XLSX, PPTX support via document.extract_text() Wire up the existing office document extractors in document.py to ReadFileTool by adding an extension guard and _read_office_doc() method that follows the established PDF pattern. Handles missing libraries, corrupt files, empty documents, and 128K truncation consistently. --- nanobot/agent/tools/filesystem.py | 30 ++++++- tests/tools/test_read_enhancements.py | 123 +++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 4 deletions(-) diff --git a/nanobot/agent/tools/filesystem.py b/nanobot/agent/tools/filesystem.py index 1f3afd341..c0d628a71 100644 --- a/nanobot/agent/tools/filesystem.py +++ b/nanobot/agent/tools/filesystem.py @@ -137,10 +137,11 @@ class ReadFileTool(_FsTool): @property def description(self) -> str: return ( - "Read a file (text or image). Text output format: LINE_NUM|CONTENT. " + "Read a file (text, image, or document). " + "Text output format: LINE_NUM|CONTENT. " "Images return visual content for analysis. " - "Use offset and limit for large files. " - "Cannot read non-image binary files. " + "Supports PDF, DOCX, XLSX, PPTX documents. " + "Use offset and limit for large text files. " "Reads exceeding ~128K chars are truncated." ) @@ -169,6 +170,10 @@ class ReadFileTool(_FsTool): if fp.suffix.lower() == ".pdf": return self._read_pdf(fp, pages) + # Office document support + if fp.suffix.lower() in {".docx", ".xlsx", ".pptx"}: + return self._read_office_doc(fp) + raw = fp.read_bytes() if not raw: return f"(Empty file: {path})" @@ -304,6 +309,25 @@ class ReadFileTool(_FsTool): result = result[:self._MAX_CHARS] + "\n\n(PDF text truncated at ~128K chars)" return result + def _read_office_doc(self, fp: Path) -> str: + from nanobot.utils.document import extract_text + + result = extract_text(fp) + + if result is None: + return f"Error: Unsupported file format: {fp.suffix}" + + if result.startswith("[error:"): + return f"Error reading {fp.suffix.upper()} file: {result}" + + if not result: + return f"({fp.suffix.upper().lstrip('.')} has no extractable text: {fp})" + + if len(result) > self._MAX_CHARS: + result = result[:self._MAX_CHARS] + "\n\n(Document text truncated at ~128K chars)" + + return result + # --------------------------------------------------------------------------- # write_file diff --git a/tests/tools/test_read_enhancements.py b/tests/tools/test_read_enhancements.py index 0be123700..f7a62f05b 100644 --- a/tests/tools/test_read_enhancements.py +++ b/tests/tools/test_read_enhancements.py @@ -1,7 +1,8 @@ -"""Tests for ReadFileTool enhancements: description fix, read dedup, PDF support, device blacklist.""" +"""Tests for ReadFileTool enhancements: description fix, read dedup, PDF support, device blacklist, office docs.""" import os import sys +from unittest.mock import patch import pytest @@ -246,3 +247,123 @@ class TestReadFileLineEndingNormalization: result = await tool.execute(path=str(f)) assert "\r" not in result assert "alpha" in result and "beta" in result and "gamma" in result + + +# --------------------------------------------------------------------------- +# Office document support (DOCX, XLSX, PPTX) +# --------------------------------------------------------------------------- + +class TestReadOfficeDocuments: + + @pytest.fixture() + def tool(self, tmp_path): + return ReadFileTool(workspace=tmp_path) + + @pytest.mark.asyncio + async def test_docx_returns_extracted_text(self, tool, tmp_path): + with patch("nanobot.utils.document.extract_text", return_value="Title\n\nParagraph 1"): + f = tmp_path / "test.docx" + f.write_bytes(b"PK") + result = await tool.execute(path=str(f)) + assert "Title" in result + assert "Paragraph 1" in result + assert "Error" not in result + + @pytest.mark.asyncio + async def test_xlsx_returns_extracted_text(self, tool, tmp_path): + with patch("nanobot.utils.document.extract_text", return_value="--- Sheet: Sheet1 ---\nName\tAge\nAlice\t30"): + f = tmp_path / "test.xlsx" + f.write_bytes(b"PK") + result = await tool.execute(path=str(f)) + assert "Sheet1" in result + assert "Alice" in result + + @pytest.mark.asyncio + async def test_pptx_returns_extracted_text(self, tool, tmp_path): + with patch("nanobot.utils.document.extract_text", return_value="--- Slide 1 ---\nWelcome\n--- Slide 2 ---\nContent"): + f = tmp_path / "test.pptx" + f.write_bytes(b"PK") + result = await tool.execute(path=str(f)) + assert "Welcome" in result + assert "Content" in result + + @pytest.mark.asyncio + async def test_docx_missing_library(self, tool, tmp_path): + with patch("nanobot.utils.document.extract_text", return_value="[error: python-docx not installed]"): + f = tmp_path / "test.docx" + f.write_bytes(b"PK") + result = await tool.execute(path=str(f)) + assert "Error" in result + assert "python-docx not installed" in result + + @pytest.mark.asyncio + async def test_docx_corrupt_file(self, tool, tmp_path): + with patch("nanobot.utils.document.extract_text", return_value="[error: failed to extract DOCX: bad zip]"): + f = tmp_path / "test.docx" + f.write_bytes(b"not-a-zip") + result = await tool.execute(path=str(f)) + assert "Error" in result + assert "failed to extract DOCX" in result + + @pytest.mark.asyncio + async def test_unsupported_extension(self, tool, tmp_path): + with patch("nanobot.utils.document.extract_text", return_value=None): + f = tmp_path / "test.docx" + f.write_bytes(b"PK") + result = await tool.execute(path=str(f)) + assert "Error" in result + assert "Unsupported" in result + + @pytest.mark.asyncio + async def test_empty_document_returns_descriptive_message(self, tool, tmp_path): + with patch("nanobot.utils.document.extract_text", return_value=""): + f = tmp_path / "empty.docx" + f.write_bytes(b"PK") + result = await tool.execute(path=str(f)) + assert "no extractable text" in result + + +class TestOfficeDocTruncation: + + @pytest.fixture() + def tool(self, tmp_path): + return ReadFileTool(workspace=tmp_path) + + @pytest.mark.asyncio + async def test_large_document_truncated(self, tool, tmp_path): + with patch("nanobot.utils.document.extract_text", return_value="x" * 200_000): + f = tmp_path / "large.docx" + f.write_bytes(b"PK") + result = await tool.execute(path=str(f)) + assert len(result) <= ReadFileTool._MAX_CHARS + 100 + assert "truncated at ~128K chars" in result + + @pytest.mark.asyncio + async def test_small_document_not_truncated(self, tool, tmp_path): + with patch("nanobot.utils.document.extract_text", return_value="Hello world"): + f = tmp_path / "small.docx" + f.write_bytes(b"PK") + result = await tool.execute(path=str(f)) + assert "truncated" not in result + assert "Hello world" in result + + @pytest.mark.asyncio + async def test_error_response_not_truncated(self, tool, tmp_path): + with patch("nanobot.utils.document.extract_text", return_value="[error: failed to extract DOCX: something went wrong]"): + f = tmp_path / "bad.docx" + f.write_bytes(b"PK") + result = await tool.execute(path=str(f)) + assert "Error" in result + assert "truncated" not in result + + +class TestReadDescriptionUpdate: + + def test_description_mentions_documents(self): + tool = ReadFileTool() + desc = tool.description.lower() + assert "document" in desc or "docx" in desc or "xlsx" in desc or "pptx" in desc + + def test_description_no_longer_says_cannot_read(self): + tool = ReadFileTool() + assert "cannot read" not in tool.description.lower()