From 53ba410e4994b5092c9ea1941ac1baf5846d181a Mon Sep 17 00:00:00 2001
From: aiguozhi123456 <126325311+aiguozhi123456@users.noreply.github.com>
Date: Mon, 20 Apr 2026 23:57:47 +0800
Subject: [PATCH] feat(read_file): add DOCX, XLSX, PPTX support via
 document.extract_text()

Wire up the existing office document extractors in document.py to
ReadFileTool by adding an extension guard and a _read_office_doc() method
that follows the established PDF pattern. Handles missing libraries,
corrupt files, empty documents, and 128K truncation consistently.
---
 nanobot/agent/tools/filesystem.py     |  30 ++++++-
 tests/tools/test_read_enhancements.py | 123 +++++++++++++++++++++++++-
 2 files changed, 149 insertions(+), 4 deletions(-)

diff --git a/nanobot/agent/tools/filesystem.py b/nanobot/agent/tools/filesystem.py
index 1f3afd341..c0d628a71 100644
--- a/nanobot/agent/tools/filesystem.py
+++ b/nanobot/agent/tools/filesystem.py
@@ -137,10 +137,11 @@ class ReadFileTool(_FsTool):
     @property
     def description(self) -> str:
         return (
-            "Read a file (text or image). Text output format: LINE_NUM|CONTENT. "
+            "Read a file (text, image, or document). "
+            "Text output format: LINE_NUM|CONTENT. "
             "Images return visual content for analysis. "
-            "Use offset and limit for large files. "
-            "Cannot read non-image binary files. "
+            "Supports PDF, DOCX, XLSX, PPTX documents. "
+            "Use offset and limit for large text files. "
             "Reads exceeding ~128K chars are truncated."
         )
 
@@ -169,6 +170,10 @@ class ReadFileTool(_FsTool):
             if fp.suffix.lower() == ".pdf":
                 return self._read_pdf(fp, pages)
 
+            # Office document support
+            if fp.suffix.lower() in {".docx", ".xlsx", ".pptx"}:
+                return self._read_office_doc(fp)
+
             raw = fp.read_bytes()
             if not raw:
                 return f"(Empty file: {path})"
@@ -304,6 +309,25 @@ class ReadFileTool(_FsTool):
             result = result[:self._MAX_CHARS] + "\n\n(PDF text truncated at ~128K chars)"
         return result
 
+    def _read_office_doc(self, fp: Path) -> str:
+        """Return the text of a DOCX/XLSX/PPTX file, or an error/notice string."""
+        from nanobot.utils.document import extract_text
+
+        result = extract_text(fp)
+        kind = fp.suffix.upper().lstrip(".")  # "DOCX", not ".DOCX", in every message
+        if result is None:
+            return f"Error: Unsupported file format: {fp.suffix}"
+        # extract_text reports failures in-band as strings starting with "[error:".
+        if result.startswith("[error:"):
+            return f"Error reading {kind} file: {result}"
+        # Whitespace-only output has no content either; report it like an empty document.
+        if not result.strip():
+            return f"({kind} has no extractable text: {fp})"
+        if len(result) > self._MAX_CHARS:
+            result = result[:self._MAX_CHARS] + "\n\n(Document text truncated at ~128K chars)"
+
+        return result
+
 
 # ---------------------------------------------------------------------------
 # write_file
diff --git a/tests/tools/test_read_enhancements.py b/tests/tools/test_read_enhancements.py
index 0be123700..f7a62f05b 100644
--- a/tests/tools/test_read_enhancements.py
+++ b/tests/tools/test_read_enhancements.py
@@ -1,7 +1,8 @@
-"""Tests for ReadFileTool enhancements: description fix, read dedup, PDF support, device blacklist."""
+"""Tests for ReadFileTool enhancements: description fix, read dedup, PDF support, device blacklist, office docs."""
 
 import os
 import sys
+from unittest.mock import patch
 
 import pytest
 
@@ -246,3 +247,123 @@ class TestReadFileLineEndingNormalization:
         result = await tool.execute(path=str(f))
         assert "\r" not in result
         assert "alpha" in result and "beta" in result and "gamma" in result
+
+
+# ---------------------------------------------------------------------------
+# Office document support (DOCX, XLSX, PPTX)
+# ---------------------------------------------------------------------------
+
+class TestReadOfficeDocuments:
+    """ReadFileTool on .docx/.xlsx/.pptx paths, with extract_text patched in every test."""
+    @pytest.fixture()
+    def tool(self, tmp_path):
+        return ReadFileTool(workspace=tmp_path)
+
+    @pytest.mark.asyncio
+    async def test_docx_returns_extracted_text(self, tool, tmp_path):
+        with patch("nanobot.utils.document.extract_text", return_value="Title\n\nParagraph 1"):
+            f = tmp_path / "test.docx"
+            f.write_bytes(b"PK")  # placeholder bytes; the patched extract_text never opens the file
+            result = await tool.execute(path=str(f))
+        assert "Title" in result
+        assert "Paragraph 1" in result
+        assert "Error" not in result
+
+    @pytest.mark.asyncio
+    async def test_xlsx_returns_extracted_text(self, tool, tmp_path):
+        with patch("nanobot.utils.document.extract_text", return_value="--- Sheet: Sheet1 ---\nName\tAge\nAlice\t30"):
+            f = tmp_path / "test.xlsx"
+            f.write_bytes(b"PK")
+            result = await tool.execute(path=str(f))
+        assert "Sheet1" in result
+        assert "Alice" in result
+
+    @pytest.mark.asyncio
+    async def test_pptx_returns_extracted_text(self, tool, tmp_path):
+        with patch("nanobot.utils.document.extract_text", return_value="--- Slide 1 ---\nWelcome\n--- Slide 2 ---\nContent"):
+            f = tmp_path / "test.pptx"
+            f.write_bytes(b"PK")
+            result = await tool.execute(path=str(f))
+        assert "Welcome" in result
+        assert "Content" in result
+
+    @pytest.mark.asyncio
+    async def test_docx_missing_library(self, tool, tmp_path):
+        with patch("nanobot.utils.document.extract_text", return_value="[error: python-docx not installed]"):
+            f = tmp_path / "test.docx"
+            f.write_bytes(b"PK")
+            result = await tool.execute(path=str(f))
+        assert "Error" in result
+        assert "python-docx not installed" in result
+
+    @pytest.mark.asyncio
+    async def test_docx_corrupt_file(self, tool, tmp_path):
+        with patch("nanobot.utils.document.extract_text", return_value="[error: failed to extract DOCX: bad zip]"):
+            f = tmp_path / "test.docx"
+            f.write_bytes(b"not-a-zip")
+            result = await tool.execute(path=str(f))
+        assert "Error" in result
+        assert "failed to extract DOCX" in result
+
+    @pytest.mark.asyncio
+    async def test_unsupported_extension(self, tool, tmp_path):
+        with patch("nanobot.utils.document.extract_text", return_value=None):
+            f = tmp_path / "test.docx"  # office suffix on purpose: only .docx/.xlsx/.pptx reach _read_office_doc
+            f.write_bytes(b"PK")
+            result = await tool.execute(path=str(f))
+        assert "Error" in result
+        assert "Unsupported" in result
+
+    @pytest.mark.asyncio
+    async def test_empty_document_returns_descriptive_message(self, tool, tmp_path):
+        with patch("nanobot.utils.document.extract_text", return_value=""):
+            f = tmp_path / "empty.docx"
+            f.write_bytes(b"PK")
+            result = await tool.execute(path=str(f))
+        assert "no extractable text" in result
+
+
+class TestOfficeDocTruncation:
+    """Truncation of extracted office-document text at ReadFileTool._MAX_CHARS."""
+    @pytest.fixture()
+    def tool(self, tmp_path):
+        return ReadFileTool(workspace=tmp_path)
+
+    @pytest.mark.asyncio
+    async def test_large_document_truncated(self, tool, tmp_path):
+        with patch("nanobot.utils.document.extract_text", return_value="x" * 200_000):
+            f = tmp_path / "large.docx"
+            f.write_bytes(b"PK")
+            result = await tool.execute(path=str(f))
+        assert len(result) <= ReadFileTool._MAX_CHARS + 100  # slack for the appended truncation notice
+        assert "truncated at ~128K chars" in result
+
+    @pytest.mark.asyncio
+    async def test_small_document_not_truncated(self, tool, tmp_path):
+        with patch("nanobot.utils.document.extract_text", return_value="Hello world"):
+            f = tmp_path / "small.docx"
+            f.write_bytes(b"PK")
+            result = await tool.execute(path=str(f))
+        assert "truncated" not in result
+        assert "Hello world" in result
+
+    @pytest.mark.asyncio
+    async def test_error_response_not_truncated(self, tool, tmp_path):
+        with patch("nanobot.utils.document.extract_text", return_value="[error: failed to extract DOCX: something went wrong]"):
+            f = tmp_path / "bad.docx"
+            f.write_bytes(b"PK")
+            result = await tool.execute(path=str(f))
+        assert "Error" in result
+        assert "truncated" not in result
+
+
+class TestReadDescriptionUpdate:
+    """Checks on the wording of ReadFileTool.description after office-document support."""
+    def test_description_mentions_documents(self):
+        tool = ReadFileTool()
+        desc = tool.description.lower()
+        assert "document" in desc or "docx" in desc or "xlsx" in desc or "pptx" in desc
+
+    def test_description_no_longer_says_cannot_read(self):
+        tool = ReadFileTool()
+        assert "cannot read" not in tool.description.lower()