test(web): cover configurable fetch behavior

Ensure that custom user agents are applied to direct web requests and that disabling Jina Reader forces the local readability path.

Made-with: Cursor
This commit is contained in:
Xubin Ren 2026-04-28 07:25:47 +00:00
parent 18432c313f
commit f4d8783f5e
4 changed files with 108 additions and 18 deletions

View File

@ -11,8 +11,7 @@ from typing import Any
from loguru import logger from loguru import logger
from nanobot.agent.hook import AgentHook, AgentHookContext from nanobot.agent.hook import AgentHook, AgentHookContext
from nanobot.utils.prompt_templates import render_template from nanobot.agent.runner import AgentRunner, AgentRunSpec
from nanobot.agent.runner import AgentRunSpec, AgentRunner
from nanobot.agent.skills import BUILTIN_SKILLS_DIR from nanobot.agent.skills import BUILTIN_SKILLS_DIR
from nanobot.agent.tools.filesystem import EditFileTool, ListDirTool, ReadFileTool, WriteFileTool from nanobot.agent.tools.filesystem import EditFileTool, ListDirTool, ReadFileTool, WriteFileTool
from nanobot.agent.tools.registry import ToolRegistry from nanobot.agent.tools.registry import ToolRegistry
@ -23,6 +22,7 @@ from nanobot.bus.events import InboundMessage
from nanobot.bus.queue import MessageBus from nanobot.bus.queue import MessageBus
from nanobot.config.schema import ExecToolConfig, WebToolsConfig from nanobot.config.schema import ExecToolConfig, WebToolsConfig
from nanobot.providers.base import LLMProvider from nanobot.providers.base import LLMProvider
from nanobot.utils.prompt_templates import render_template
@dataclass(slots=True) @dataclass(slots=True)

View File

@ -18,7 +18,7 @@ from nanobot.agent.tools.schema import IntegerSchema, StringSchema, tool_paramet
from nanobot.utils.helpers import build_image_content_blocks from nanobot.utils.helpers import build_image_content_blocks
if TYPE_CHECKING: if TYPE_CHECKING:
from nanobot.config.schema import WebSearchConfig, WebFetchConfig from nanobot.config.schema import WebFetchConfig, WebSearchConfig
# Shared constants # Shared constants
_DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36" _DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36"
@ -159,7 +159,11 @@ class WebSearchTool(Tool):
r = await client.get( r = await client.get(
"https://api.search.brave.com/res/v1/web/search", "https://api.search.brave.com/res/v1/web/search",
params={"q": query, "count": n}, params={"q": query, "count": n},
headers={"Accept": "application/json", "X-Subscription-Token": api_key}, headers={
"Accept": "application/json",
"X-Subscription-Token": api_key,
"User-Agent": self.user_agent,
},
timeout=10.0, timeout=10.0,
) )
r.raise_for_status() r.raise_for_status()
@ -180,7 +184,7 @@ class WebSearchTool(Tool):
async with httpx.AsyncClient(proxy=self.proxy) as client: async with httpx.AsyncClient(proxy=self.proxy) as client:
r = await client.post( r = await client.post(
"https://api.tavily.com/search", "https://api.tavily.com/search",
headers={"Authorization": f"Bearer {api_key}"}, headers={"Authorization": f"Bearer {api_key}", "User-Agent": self.user_agent},
json={"query": query, "max_results": n}, json={"query": query, "max_results": n},
timeout=15.0, timeout=15.0,
) )
@ -217,7 +221,11 @@ class WebSearchTool(Tool):
logger.warning("JINA_API_KEY not set, falling back to DuckDuckGo") logger.warning("JINA_API_KEY not set, falling back to DuckDuckGo")
return await self._search_duckduckgo(query, n) return await self._search_duckduckgo(query, n)
try: try:
headers = {"Accept": "application/json", "Authorization": f"Bearer {api_key}"} headers = {
"Accept": "application/json",
"Authorization": f"Bearer {api_key}",
"User-Agent": self.user_agent,
}
encoded_query = quote(query, safe="") encoded_query = quote(query, safe="")
async with httpx.AsyncClient(proxy=self.proxy) as client: async with httpx.AsyncClient(proxy=self.proxy) as client:
r = await client.get( r = await client.get(
@ -246,7 +254,7 @@ class WebSearchTool(Tool):
r = await client.get( r = await client.get(
"https://kagi.com/api/v0/search", "https://kagi.com/api/v0/search",
params={"q": query, "limit": n}, params={"q": query, "limit": n},
headers={"Authorization": f"Bot {api_key}"}, headers={"Authorization": f"Bot {api_key}", "User-Agent": self.user_agent},
timeout=10.0, timeout=10.0,
) )
r.raise_for_status() r.raise_for_status()
@ -316,8 +324,15 @@ class WebFetchTool(Tool):
def read_only(self) -> bool: def read_only(self) -> bool:
return True return True
async def execute(self, url: str, extractMode: str = "markdown", maxChars: int | None = None, **kwargs: Any) -> Any: async def execute(
max_chars = maxChars or self.max_chars self,
url: str,
extract_mode: str = "markdown",
max_chars: int | None = None,
**kwargs: Any,
) -> Any:
extract_mode = kwargs.pop("extractMode", extract_mode)
max_chars = kwargs.pop("maxChars", max_chars) or self.max_chars
is_valid, error_msg = _validate_url_safe(url) is_valid, error_msg = _validate_url_safe(url)
if not is_valid: if not is_valid:
return json.dumps({"error": f"URL validation failed: {error_msg}", "url": url}, ensure_ascii=False) return json.dumps({"error": f"URL validation failed: {error_msg}", "url": url}, ensure_ascii=False)
@ -344,7 +359,7 @@ class WebFetchTool(Tool):
if self.config.use_jina_reader: if self.config.use_jina_reader:
result = await self._fetch_jina(url, max_chars) result = await self._fetch_jina(url, max_chars)
if result is None: if result is None:
result = await self._fetch_readability(url, extractMode, max_chars) result = await self._fetch_readability(url, extract_mode, max_chars)
return result return result
async def _fetch_jina(self, url: str, max_chars: int) -> str | None: async def _fetch_jina(self, url: str, max_chars: int) -> str | None:

View File

@ -9,6 +9,7 @@ from unittest.mock import patch
import pytest import pytest
from nanobot.agent.tools.web import WebFetchTool from nanobot.agent.tools.web import WebFetchTool
from nanobot.config.schema import WebFetchConfig
def _fake_resolve_private(hostname, port, family=0, type_=0): def _fake_resolve_private(hostname, port, family=0, type_=0):
@ -47,7 +48,6 @@ async def test_web_fetch_result_contains_untrusted_flag():
fake_html = "<html><head><title>Test</title></head><body><p>Hello world</p></body></html>" fake_html = "<html><head><title>Test</title></head><body><p>Hello world</p></body></html>"
import httpx
class FakeResponse: class FakeResponse:
status_code = 200 status_code = 200
@ -69,6 +69,68 @@ async def test_web_fetch_result_contains_untrusted_flag():
assert "[External content" in data.get("text", "") assert "[External content" in data.get("text", "")
@pytest.mark.asyncio
async def test_web_fetch_can_skip_jina_and_use_custom_user_agent(monkeypatch):
    """Check the Jina-disabled fetch path and the custom User-Agent header.

    The tool is built with ``use_jina_reader=False`` and a custom user agent.
    ``_fetch_jina`` is replaced with a stub that raises, so any call to it
    fails the test. ``httpx.AsyncClient`` is replaced with a fake that records
    the headers passed to ``stream`` and ``get``; exactly two recorded calls
    are expected, each carrying the custom User-Agent.
    """
    captured_headers: list[dict] = []

    async def _jina_must_not_run(*args, **kwargs):
        # Reaching this stub means the disabled Jina path was still taken.
        raise AssertionError("Jina Reader should be skipped when disabled")

    class FakeStreamResponse:
        # Minimal async-context-manager stand-in returned by FakeClient.stream.
        headers = {"content-type": "text/html"}
        url = "https://example.com/page"

        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            return False

    class FakeResponse:
        # Canned HTML page handed back by FakeClient.get.
        status_code = 200
        url = "https://example.com/page"
        text = "<html><head><title>Test</title></head><body><p>Hello world</p></body></html>"
        headers = {"content-type": "text/html"}

        def raise_for_status(self):
            return None

    class FakeClient:
        # Replacement for httpx.AsyncClient that only records request headers.
        def __init__(self, *args, **kwargs):
            pass

        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            return False

        def stream(self, method, url, headers=None):
            captured_headers.append(headers or {})
            return FakeStreamResponse()

        async def get(self, url, headers=None):
            captured_headers.append(headers or {})
            return FakeResponse()

    fetcher = WebFetchTool(
        config=WebFetchConfig(use_jina_reader=False),
        user_agent="nanobot-test-agent",
    )
    monkeypatch.setattr(fetcher, "_fetch_jina", _jina_must_not_run)
    monkeypatch.setattr("nanobot.agent.tools.web.httpx.AsyncClient", FakeClient)

    with patch("nanobot.security.network.socket.getaddrinfo", _fake_resolve_public):
        raw_result = await fetcher.execute(url="https://example.com/page")

    payload = json.loads(raw_result)
    assert payload["extractor"] == "readability"

    sent_user_agents = [entry["User-Agent"] for entry in captured_headers]
    assert sent_user_agents == ["nanobot-test-agent", "nanobot-test-agent"]
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_web_fetch_blocks_private_redirect_before_returning_image(monkeypatch): async def test_web_fetch_blocks_private_redirect_before_returning_image(monkeypatch):
tool = WebFetchTool() tool = WebFetchTool()

View File

@ -7,8 +7,16 @@ from nanobot.agent.tools.web import WebSearchTool
from nanobot.config.schema import WebSearchConfig from nanobot.config.schema import WebSearchConfig
def _tool(provider: str = "brave", api_key: str = "", base_url: str = "") -> WebSearchTool: def _tool(
return WebSearchTool(config=WebSearchConfig(provider=provider, api_key=api_key, base_url=base_url)) provider: str = "brave",
api_key: str = "",
base_url: str = "",
user_agent: str | None = None,
) -> WebSearchTool:
return WebSearchTool(
config=WebSearchConfig(provider=provider, api_key=api_key, base_url=base_url),
user_agent=user_agent,
)
def _response(status: int = 200, json: dict | None = None) -> httpx.Response: def _response(status: int = 200, json: dict | None = None) -> httpx.Response:
@ -42,12 +50,13 @@ async def test_brave_search(monkeypatch):
async def mock_get(self, url, **kw): async def mock_get(self, url, **kw):
assert "brave" in url assert "brave" in url
assert kw["headers"]["X-Subscription-Token"] == "brave-key" assert kw["headers"]["X-Subscription-Token"] == "brave-key"
assert kw["headers"]["User-Agent"] == "nanobot-search-test"
return _response(json={ return _response(json={
"web": {"results": [{"title": "NanoBot", "url": "https://example.com", "description": "AI assistant"}]} "web": {"results": [{"title": "NanoBot", "url": "https://example.com", "description": "AI assistant"}]}
}) })
monkeypatch.setattr(httpx.AsyncClient, "get", mock_get) monkeypatch.setattr(httpx.AsyncClient, "get", mock_get)
tool = _tool(provider="brave", api_key="brave-key") tool = _tool(provider="brave", api_key="brave-key", user_agent="nanobot-search-test")
result = await tool.execute(query="nanobot", count=1) result = await tool.execute(query="nanobot", count=1)
assert "NanoBot" in result assert "NanoBot" in result
assert "https://example.com" in result assert "https://example.com" in result
@ -58,12 +67,13 @@ async def test_tavily_search(monkeypatch):
async def mock_post(self, url, **kw): async def mock_post(self, url, **kw):
assert "tavily" in url assert "tavily" in url
assert kw["headers"]["Authorization"] == "Bearer tavily-key" assert kw["headers"]["Authorization"] == "Bearer tavily-key"
assert kw["headers"]["User-Agent"] == "nanobot-search-test"
return _response(json={ return _response(json={
"results": [{"title": "OpenClaw", "url": "https://openclaw.io", "content": "Framework"}] "results": [{"title": "OpenClaw", "url": "https://openclaw.io", "content": "Framework"}]
}) })
monkeypatch.setattr(httpx.AsyncClient, "post", mock_post) monkeypatch.setattr(httpx.AsyncClient, "post", mock_post)
tool = _tool(provider="tavily", api_key="tavily-key") tool = _tool(provider="tavily", api_key="tavily-key", user_agent="nanobot-search-test")
result = await tool.execute(query="openclaw") result = await tool.execute(query="openclaw")
assert "OpenClaw" in result assert "OpenClaw" in result
assert "https://openclaw.io" in result assert "https://openclaw.io" in result
@ -73,12 +83,13 @@ async def test_tavily_search(monkeypatch):
async def test_searxng_search(monkeypatch): async def test_searxng_search(monkeypatch):
async def mock_get(self, url, **kw): async def mock_get(self, url, **kw):
assert "searx.example" in url assert "searx.example" in url
assert kw["headers"]["User-Agent"] == "nanobot-search-test"
return _response(json={ return _response(json={
"results": [{"title": "Result", "url": "https://example.com", "content": "SearXNG result"}] "results": [{"title": "Result", "url": "https://example.com", "content": "SearXNG result"}]
}) })
monkeypatch.setattr(httpx.AsyncClient, "get", mock_get) monkeypatch.setattr(httpx.AsyncClient, "get", mock_get)
tool = _tool(provider="searxng", base_url="https://searx.example") tool = _tool(provider="searxng", base_url="https://searx.example", user_agent="nanobot-search-test")
result = await tool.execute(query="test") result = await tool.execute(query="test")
assert "Result" in result assert "Result" in result
@ -125,12 +136,13 @@ async def test_jina_search(monkeypatch):
async def mock_get(self, url, **kw): async def mock_get(self, url, **kw):
assert "s.jina.ai" in str(url) assert "s.jina.ai" in str(url)
assert kw["headers"]["Authorization"] == "Bearer jina-key" assert kw["headers"]["Authorization"] == "Bearer jina-key"
assert kw["headers"]["User-Agent"] == "nanobot-search-test"
return _response(json={ return _response(json={
"data": [{"title": "Jina Result", "url": "https://jina.ai", "content": "AI search"}] "data": [{"title": "Jina Result", "url": "https://jina.ai", "content": "AI search"}]
}) })
monkeypatch.setattr(httpx.AsyncClient, "get", mock_get) monkeypatch.setattr(httpx.AsyncClient, "get", mock_get)
tool = _tool(provider="jina", api_key="jina-key") tool = _tool(provider="jina", api_key="jina-key", user_agent="nanobot-search-test")
result = await tool.execute(query="test") result = await tool.execute(query="test")
assert "Jina Result" in result assert "Jina Result" in result
assert "https://jina.ai" in result assert "https://jina.ai" in result
@ -141,6 +153,7 @@ async def test_kagi_search(monkeypatch):
async def mock_get(self, url, **kw): async def mock_get(self, url, **kw):
assert "kagi.com/api/v0/search" in url assert "kagi.com/api/v0/search" in url
assert kw["headers"]["Authorization"] == "Bot kagi-key" assert kw["headers"]["Authorization"] == "Bot kagi-key"
assert kw["headers"]["User-Agent"] == "nanobot-search-test"
assert kw["params"] == {"q": "test", "limit": 2} assert kw["params"] == {"q": "test", "limit": 2}
return _response(json={ return _response(json={
"data": [ "data": [
@ -150,7 +163,7 @@ async def test_kagi_search(monkeypatch):
}) })
monkeypatch.setattr(httpx.AsyncClient, "get", mock_get) monkeypatch.setattr(httpx.AsyncClient, "get", mock_get)
tool = _tool(provider="kagi", api_key="kagi-key") tool = _tool(provider="kagi", api_key="kagi-key", user_agent="nanobot-search-test")
result = await tool.execute(query="test", count=2) result = await tool.execute(query="test", count=2)
assert "Kagi Result" in result assert "Kagi Result" in result
assert "https://kagi.com" in result assert "https://kagi.com" in result