From f4d8783f5e5949536457d82e43d13cf3a1f7d9df Mon Sep 17 00:00:00 2001 From: Xubin Ren Date: Tue, 28 Apr 2026 07:25:47 +0000 Subject: [PATCH] test(web): cover configurable fetch behavior Ensure custom user agents are applied to direct web requests and disabling Jina Reader forces the local readability path. Made-with: Cursor --- nanobot/agent/subagent.py | 4 +- nanobot/agent/tools/web.py | 31 +++++++++---- tests/tools/test_web_fetch_security.py | 64 +++++++++++++++++++++++++- tests/tools/test_web_search_tool.py | 27 ++++++++--- 4 files changed, 108 insertions(+), 18 deletions(-) diff --git a/nanobot/agent/subagent.py b/nanobot/agent/subagent.py index 0eaad1c27..c100d205b 100644 --- a/nanobot/agent/subagent.py +++ b/nanobot/agent/subagent.py @@ -11,8 +11,7 @@ from typing import Any from loguru import logger from nanobot.agent.hook import AgentHook, AgentHookContext -from nanobot.utils.prompt_templates import render_template -from nanobot.agent.runner import AgentRunSpec, AgentRunner +from nanobot.agent.runner import AgentRunner, AgentRunSpec from nanobot.agent.skills import BUILTIN_SKILLS_DIR from nanobot.agent.tools.filesystem import EditFileTool, ListDirTool, ReadFileTool, WriteFileTool from nanobot.agent.tools.registry import ToolRegistry @@ -23,6 +22,7 @@ from nanobot.bus.events import InboundMessage from nanobot.bus.queue import MessageBus from nanobot.config.schema import ExecToolConfig, WebToolsConfig from nanobot.providers.base import LLMProvider +from nanobot.utils.prompt_templates import render_template @dataclass(slots=True) diff --git a/nanobot/agent/tools/web.py b/nanobot/agent/tools/web.py index 26052b87e..28a4c557f 100644 --- a/nanobot/agent/tools/web.py +++ b/nanobot/agent/tools/web.py @@ -18,7 +18,7 @@ from nanobot.agent.tools.schema import IntegerSchema, StringSchema, tool_paramet from nanobot.utils.helpers import build_image_content_blocks if TYPE_CHECKING: - from nanobot.config.schema import WebSearchConfig, WebFetchConfig + from nanobot.config.schema import WebFetchConfig, WebSearchConfig # Shared constants _DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36" @@ -159,7 +159,11 @@ class WebSearchTool(Tool): r = await client.get( "https://api.search.brave.com/res/v1/web/search", params={"q": query, "count": n}, - headers={"Accept": "application/json", "X-Subscription-Token": api_key}, + headers={ + "Accept": "application/json", + "X-Subscription-Token": api_key, + "User-Agent": self.user_agent, + }, timeout=10.0, ) r.raise_for_status() @@ -180,7 +184,7 @@ class WebSearchTool(Tool): async with httpx.AsyncClient(proxy=self.proxy) as client: r = await client.post( "https://api.tavily.com/search", - headers={"Authorization": f"Bearer {api_key}"}, + headers={"Authorization": f"Bearer {api_key}", "User-Agent": self.user_agent}, json={"query": query, "max_results": n}, timeout=15.0, ) @@ -217,7 +221,11 @@ class WebSearchTool(Tool): logger.warning("JINA_API_KEY not set, falling back to DuckDuckGo") return await self._search_duckduckgo(query, n) try: - headers = {"Accept": "application/json", "Authorization": f"Bearer {api_key}"} + headers = { + "Accept": "application/json", + "Authorization": f"Bearer {api_key}", + "User-Agent": self.user_agent, + } encoded_query = quote(query, safe="") async with httpx.AsyncClient(proxy=self.proxy) as client: r = await client.get( @@ -246,7 +254,7 @@ class WebSearchTool(Tool): r = await client.get( "https://kagi.com/api/v0/search", params={"q": query, "limit": n}, - headers={"Authorization": f"Bot {api_key}"}, + headers={"Authorization": f"Bot {api_key}", "User-Agent": self.user_agent}, timeout=10.0, ) r.raise_for_status() @@ -316,8 +324,15 @@ class WebFetchTool(Tool): def read_only(self) -> bool: return True - async def execute(self, url: str, extractMode: str = "markdown", maxChars: int | None = None, **kwargs: Any) -> Any: - max_chars = maxChars or self.max_chars + async def execute( + self, + url: str, + extract_mode: str = "markdown", + max_chars: int | None = None, + **kwargs: Any, + ) -> Any: + extract_mode = kwargs.pop("extractMode", extract_mode) + max_chars = kwargs.pop("maxChars", max_chars) or self.max_chars is_valid, error_msg = _validate_url_safe(url) if not is_valid: return json.dumps({"error": f"URL validation failed: {error_msg}", "url": url}, ensure_ascii=False) @@ -344,7 +359,7 @@ class WebFetchTool(Tool): if self.config.use_jina_reader: result = await self._fetch_jina(url, max_chars) if result is None: - result = await self._fetch_readability(url, extractMode, max_chars) + result = await self._fetch_readability(url, extract_mode, max_chars) return result async def _fetch_jina(self, url: str, max_chars: int) -> str | None: diff --git a/tests/tools/test_web_fetch_security.py b/tests/tools/test_web_fetch_security.py index dbdf2340a..58664cf33 100644 --- a/tests/tools/test_web_fetch_security.py +++ b/tests/tools/test_web_fetch_security.py @@ -9,6 +9,7 @@ from unittest.mock import patch import pytest from nanobot.agent.tools.web import WebFetchTool +from nanobot.config.schema import WebFetchConfig def _fake_resolve_private(hostname, port, family=0, type_=0): @@ -47,7 +48,6 @@ async def test_web_fetch_result_contains_untrusted_flag(): fake_html = "Test

Hello world

" - import httpx class FakeResponse: status_code = 200 @@ -69,6 +69,68 @@ async def test_web_fetch_result_contains_untrusted_flag(): assert "[External content" in data.get("text", "") +@pytest.mark.asyncio +async def test_web_fetch_can_skip_jina_and_use_custom_user_agent(monkeypatch): + tool = WebFetchTool( + config=WebFetchConfig(use_jina_reader=False), + user_agent="nanobot-test-agent", + ) + seen_headers: list[dict] = [] + + async def _fail_jina(*args, **kwargs): + raise AssertionError("Jina Reader should be skipped when disabled") + + class FakeStreamResponse: + headers = {"content-type": "text/html"} + url = "https://example.com/page" + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + class FakeResponse: + status_code = 200 + url = "https://example.com/page" + text = "Test

Hello world

" + headers = {"content-type": "text/html"} + + def raise_for_status(self): + return None + + class FakeClient: + def __init__(self, *args, **kwargs): + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + def stream(self, method, url, headers=None): + seen_headers.append(headers or {}) + return FakeStreamResponse() + + async def get(self, url, headers=None): + seen_headers.append(headers or {}) + return FakeResponse() + + monkeypatch.setattr(tool, "_fetch_jina", _fail_jina) + monkeypatch.setattr("nanobot.agent.tools.web.httpx.AsyncClient", FakeClient) + + with patch("nanobot.security.network.socket.getaddrinfo", _fake_resolve_public): + result = await tool.execute(url="https://example.com/page") + + data = json.loads(result) + assert data["extractor"] == "readability" + assert [headers["User-Agent"] for headers in seen_headers] == [ + "nanobot-test-agent", + "nanobot-test-agent", + ] + + @pytest.mark.asyncio async def test_web_fetch_blocks_private_redirect_before_returning_image(monkeypatch): tool = WebFetchTool() diff --git a/tests/tools/test_web_search_tool.py b/tests/tools/test_web_search_tool.py index a42e51e1a..116d4db09 100644 --- a/tests/tools/test_web_search_tool.py +++ b/tests/tools/test_web_search_tool.py @@ -7,8 +7,16 @@ from nanobot.agent.tools.web import WebSearchTool from nanobot.config.schema import WebSearchConfig -def _tool(provider: str = "brave", api_key: str = "", base_url: str = "") -> WebSearchTool: - return WebSearchTool(config=WebSearchConfig(provider=provider, api_key=api_key, base_url=base_url)) +def _tool( + provider: str = "brave", + api_key: str = "", + base_url: str = "", + user_agent: str | None = None, +) -> WebSearchTool: + return WebSearchTool( + config=WebSearchConfig(provider=provider, api_key=api_key, base_url=base_url), + user_agent=user_agent, + ) def _response(status: int = 200, json: dict | None = None) -> httpx.Response: @@ -42,12 +50,13 @@ async def test_brave_search(monkeypatch): async def mock_get(self, url, **kw): assert "brave" in url assert kw["headers"]["X-Subscription-Token"] == "brave-key" + assert kw["headers"]["User-Agent"] == "nanobot-search-test" return _response(json={ "web": {"results": [{"title": "NanoBot", "url": "https://example.com", "description": "AI assistant"}]} }) monkeypatch.setattr(httpx.AsyncClient, "get", mock_get) - tool = _tool(provider="brave", api_key="brave-key") + tool = _tool(provider="brave", api_key="brave-key", user_agent="nanobot-search-test") result = await tool.execute(query="nanobot", count=1) assert "NanoBot" in result assert "https://example.com" in result @@ -58,12 +67,13 @@ async def test_tavily_search(monkeypatch): async def mock_post(self, url, **kw): assert "tavily" in url assert kw["headers"]["Authorization"] == "Bearer tavily-key" + assert kw["headers"]["User-Agent"] == "nanobot-search-test" return _response(json={ "results": [{"title": "OpenClaw", "url": "https://openclaw.io", "content": "Framework"}] }) monkeypatch.setattr(httpx.AsyncClient, "post", mock_post) - tool = _tool(provider="tavily", api_key="tavily-key") + tool = _tool(provider="tavily", api_key="tavily-key", user_agent="nanobot-search-test") result = await tool.execute(query="openclaw") assert "OpenClaw" in result assert "https://openclaw.io" in result @@ -73,12 +83,13 @@ async def test_tavily_search(monkeypatch): async def test_searxng_search(monkeypatch): async def mock_get(self, url, **kw): assert "searx.example" in url + assert kw["headers"]["User-Agent"] == "nanobot-search-test" return _response(json={ "results": [{"title": "Result", "url": "https://example.com", "content": "SearXNG result"}] }) monkeypatch.setattr(httpx.AsyncClient, "get", mock_get) - tool = _tool(provider="searxng", base_url="https://searx.example") + tool = _tool(provider="searxng", base_url="https://searx.example", user_agent="nanobot-search-test") result = await tool.execute(query="test") assert "Result" in result @@ -125,12 +136,13 @@ async def test_jina_search(monkeypatch): async def mock_get(self, url, **kw): assert "s.jina.ai" in str(url) assert kw["headers"]["Authorization"] == "Bearer jina-key" + assert kw["headers"]["User-Agent"] == "nanobot-search-test" return _response(json={ "data": [{"title": "Jina Result", "url": "https://jina.ai", "content": "AI search"}] }) monkeypatch.setattr(httpx.AsyncClient, "get", mock_get) - tool = _tool(provider="jina", api_key="jina-key") + tool = _tool(provider="jina", api_key="jina-key", user_agent="nanobot-search-test") result = await tool.execute(query="test") assert "Jina Result" in result assert "https://jina.ai" in result @@ -141,6 +153,7 @@ async def test_kagi_search(monkeypatch): async def mock_get(self, url, **kw): assert "kagi.com/api/v0/search" in url assert kw["headers"]["Authorization"] == "Bot kagi-key" + assert kw["headers"]["User-Agent"] == "nanobot-search-test" assert kw["params"] == {"q": "test", "limit": 2} return _response(json={ "data": [ @@ -150,7 +163,7 @@ async def test_kagi_search(monkeypatch): }) monkeypatch.setattr(httpx.AsyncClient, "get", mock_get) - tool = _tool(provider="kagi", api_key="kagi-key") + tool = _tool(provider="kagi", api_key="kagi-key", user_agent="nanobot-search-test") result = await tool.execute(query="test", count=2) assert "Kagi Result" in result assert "https://kagi.com" in result