test(web): cover configurable fetch behavior

Ensure that custom user agents are applied to direct web requests and that disabling Jina Reader forces the local readability path.

Made-with: Cursor
This commit is contained in:
Xubin Ren 2026-04-28 07:25:47 +00:00
parent 18432c313f
commit f4d8783f5e
4 changed files with 108 additions and 18 deletions

View File

@ -11,8 +11,7 @@ from typing import Any
from loguru import logger
from nanobot.agent.hook import AgentHook, AgentHookContext
from nanobot.utils.prompt_templates import render_template
from nanobot.agent.runner import AgentRunSpec, AgentRunner
from nanobot.agent.runner import AgentRunner, AgentRunSpec
from nanobot.agent.skills import BUILTIN_SKILLS_DIR
from nanobot.agent.tools.filesystem import EditFileTool, ListDirTool, ReadFileTool, WriteFileTool
from nanobot.agent.tools.registry import ToolRegistry
@ -23,6 +22,7 @@ from nanobot.bus.events import InboundMessage
from nanobot.bus.queue import MessageBus
from nanobot.config.schema import ExecToolConfig, WebToolsConfig
from nanobot.providers.base import LLMProvider
from nanobot.utils.prompt_templates import render_template
@dataclass(slots=True)

View File

@ -18,7 +18,7 @@ from nanobot.agent.tools.schema import IntegerSchema, StringSchema, tool_paramet
from nanobot.utils.helpers import build_image_content_blocks
if TYPE_CHECKING:
from nanobot.config.schema import WebSearchConfig, WebFetchConfig
from nanobot.config.schema import WebFetchConfig, WebSearchConfig
# Shared constants
_DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36"
@ -159,7 +159,11 @@ class WebSearchTool(Tool):
r = await client.get(
"https://api.search.brave.com/res/v1/web/search",
params={"q": query, "count": n},
headers={"Accept": "application/json", "X-Subscription-Token": api_key},
headers={
"Accept": "application/json",
"X-Subscription-Token": api_key,
"User-Agent": self.user_agent,
},
timeout=10.0,
)
r.raise_for_status()
@ -180,7 +184,7 @@ class WebSearchTool(Tool):
async with httpx.AsyncClient(proxy=self.proxy) as client:
r = await client.post(
"https://api.tavily.com/search",
headers={"Authorization": f"Bearer {api_key}"},
headers={"Authorization": f"Bearer {api_key}", "User-Agent": self.user_agent},
json={"query": query, "max_results": n},
timeout=15.0,
)
@ -217,7 +221,11 @@ class WebSearchTool(Tool):
logger.warning("JINA_API_KEY not set, falling back to DuckDuckGo")
return await self._search_duckduckgo(query, n)
try:
headers = {"Accept": "application/json", "Authorization": f"Bearer {api_key}"}
headers = {
"Accept": "application/json",
"Authorization": f"Bearer {api_key}",
"User-Agent": self.user_agent,
}
encoded_query = quote(query, safe="")
async with httpx.AsyncClient(proxy=self.proxy) as client:
r = await client.get(
@ -246,7 +254,7 @@ class WebSearchTool(Tool):
r = await client.get(
"https://kagi.com/api/v0/search",
params={"q": query, "limit": n},
headers={"Authorization": f"Bot {api_key}"},
headers={"Authorization": f"Bot {api_key}", "User-Agent": self.user_agent},
timeout=10.0,
)
r.raise_for_status()
@ -316,8 +324,15 @@ class WebFetchTool(Tool):
def read_only(self) -> bool:
return True
async def execute(self, url: str, extractMode: str = "markdown", maxChars: int | None = None, **kwargs: Any) -> Any:
max_chars = maxChars or self.max_chars
async def execute(
self,
url: str,
extract_mode: str = "markdown",
max_chars: int | None = None,
**kwargs: Any,
) -> Any:
extract_mode = kwargs.pop("extractMode", extract_mode)
max_chars = kwargs.pop("maxChars", max_chars) or self.max_chars
is_valid, error_msg = _validate_url_safe(url)
if not is_valid:
return json.dumps({"error": f"URL validation failed: {error_msg}", "url": url}, ensure_ascii=False)
@ -344,7 +359,7 @@ class WebFetchTool(Tool):
if self.config.use_jina_reader:
result = await self._fetch_jina(url, max_chars)
if result is None:
result = await self._fetch_readability(url, extractMode, max_chars)
result = await self._fetch_readability(url, extract_mode, max_chars)
return result
async def _fetch_jina(self, url: str, max_chars: int) -> str | None:

View File

@ -9,6 +9,7 @@ from unittest.mock import patch
import pytest
from nanobot.agent.tools.web import WebFetchTool
from nanobot.config.schema import WebFetchConfig
def _fake_resolve_private(hostname, port, family=0, type_=0):
@ -47,7 +48,6 @@ async def test_web_fetch_result_contains_untrusted_flag():
fake_html = "<html><head><title>Test</title></head><body><p>Hello world</p></body></html>"
import httpx
class FakeResponse:
status_code = 200
@ -69,6 +69,68 @@ async def test_web_fetch_result_contains_untrusted_flag():
assert "[External content" in data.get("text", "")
@pytest.mark.asyncio
async def test_web_fetch_can_skip_jina_and_use_custom_user_agent(monkeypatch):
    """With Jina Reader disabled, the fetch must use readability and send the custom User-Agent."""
    tool = WebFetchTool(
        config=WebFetchConfig(use_jina_reader=False),
        user_agent="nanobot-test-agent",
    )
    captured_headers: list[dict] = []

    def _record(headers):
        # Normalise a missing headers argument to an empty dict so the final
        # assertion fails with a KeyError instead of a TypeError.
        captured_headers.append(headers or {})

    async def _jina_must_not_run(*args, **kwargs):
        raise AssertionError("Jina Reader should be skipped when disabled")

    class _AsyncContext:
        """Minimal async context manager that never suppresses exceptions."""

        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            return False

    class StubStreamResponse(_AsyncContext):
        # Returned by the stubbed client's stream() call.
        headers = {"content-type": "text/html"}
        url = "https://example.com/page"

    class StubPageResponse:
        # Returned by the stubbed client's get() call.
        status_code = 200
        url = "https://example.com/page"
        text = "<html><head><title>Test</title></head><body><p>Hello world</p></body></html>"
        headers = {"content-type": "text/html"}

        def raise_for_status(self):
            return None

    class StubClient(_AsyncContext):
        """Stand-in for httpx.AsyncClient that records the headers of each request."""

        def __init__(self, *args, **kwargs):
            pass

        def stream(self, method, url, headers=None):
            _record(headers)
            return StubStreamResponse()

        async def get(self, url, headers=None):
            _record(headers)
            return StubPageResponse()

    # Any call into the Jina path fails the test immediately.
    monkeypatch.setattr(tool, "_fetch_jina", _jina_must_not_run)
    monkeypatch.setattr("nanobot.agent.tools.web.httpx.AsyncClient", StubClient)

    with patch("nanobot.security.network.socket.getaddrinfo", _fake_resolve_public):
        raw = await tool.execute(url="https://example.com/page")

    payload = json.loads(raw)
    assert payload["extractor"] == "readability"
    # Exactly two requests are expected, each carrying the configured agent.
    assert [hdrs["User-Agent"] for hdrs in captured_headers] == ["nanobot-test-agent"] * 2
@pytest.mark.asyncio
async def test_web_fetch_blocks_private_redirect_before_returning_image(monkeypatch):
tool = WebFetchTool()

View File

@ -7,8 +7,16 @@ from nanobot.agent.tools.web import WebSearchTool
from nanobot.config.schema import WebSearchConfig
def _tool(provider: str = "brave", api_key: str = "", base_url: str = "") -> WebSearchTool:
return WebSearchTool(config=WebSearchConfig(provider=provider, api_key=api_key, base_url=base_url))
def _tool(
    provider: str = "brave",
    api_key: str = "",
    base_url: str = "",
    user_agent: str | None = None,
) -> WebSearchTool:
    """Build a WebSearchTool from the given provider settings and optional user agent."""
    search_config = WebSearchConfig(provider=provider, api_key=api_key, base_url=base_url)
    return WebSearchTool(config=search_config, user_agent=user_agent)
def _response(status: int = 200, json: dict | None = None) -> httpx.Response:
@ -42,12 +50,13 @@ async def test_brave_search(monkeypatch):
async def mock_get(self, url, **kw):
assert "brave" in url
assert kw["headers"]["X-Subscription-Token"] == "brave-key"
assert kw["headers"]["User-Agent"] == "nanobot-search-test"
return _response(json={
"web": {"results": [{"title": "NanoBot", "url": "https://example.com", "description": "AI assistant"}]}
})
monkeypatch.setattr(httpx.AsyncClient, "get", mock_get)
tool = _tool(provider="brave", api_key="brave-key")
tool = _tool(provider="brave", api_key="brave-key", user_agent="nanobot-search-test")
result = await tool.execute(query="nanobot", count=1)
assert "NanoBot" in result
assert "https://example.com" in result
@ -58,12 +67,13 @@ async def test_tavily_search(monkeypatch):
async def mock_post(self, url, **kw):
assert "tavily" in url
assert kw["headers"]["Authorization"] == "Bearer tavily-key"
assert kw["headers"]["User-Agent"] == "nanobot-search-test"
return _response(json={
"results": [{"title": "OpenClaw", "url": "https://openclaw.io", "content": "Framework"}]
})
monkeypatch.setattr(httpx.AsyncClient, "post", mock_post)
tool = _tool(provider="tavily", api_key="tavily-key")
tool = _tool(provider="tavily", api_key="tavily-key", user_agent="nanobot-search-test")
result = await tool.execute(query="openclaw")
assert "OpenClaw" in result
assert "https://openclaw.io" in result
@ -73,12 +83,13 @@ async def test_tavily_search(monkeypatch):
async def test_searxng_search(monkeypatch):
async def mock_get(self, url, **kw):
assert "searx.example" in url
assert kw["headers"]["User-Agent"] == "nanobot-search-test"
return _response(json={
"results": [{"title": "Result", "url": "https://example.com", "content": "SearXNG result"}]
})
monkeypatch.setattr(httpx.AsyncClient, "get", mock_get)
tool = _tool(provider="searxng", base_url="https://searx.example")
tool = _tool(provider="searxng", base_url="https://searx.example", user_agent="nanobot-search-test")
result = await tool.execute(query="test")
assert "Result" in result
@ -125,12 +136,13 @@ async def test_jina_search(monkeypatch):
async def mock_get(self, url, **kw):
assert "s.jina.ai" in str(url)
assert kw["headers"]["Authorization"] == "Bearer jina-key"
assert kw["headers"]["User-Agent"] == "nanobot-search-test"
return _response(json={
"data": [{"title": "Jina Result", "url": "https://jina.ai", "content": "AI search"}]
})
monkeypatch.setattr(httpx.AsyncClient, "get", mock_get)
tool = _tool(provider="jina", api_key="jina-key")
tool = _tool(provider="jina", api_key="jina-key", user_agent="nanobot-search-test")
result = await tool.execute(query="test")
assert "Jina Result" in result
assert "https://jina.ai" in result
@ -141,6 +153,7 @@ async def test_kagi_search(monkeypatch):
async def mock_get(self, url, **kw):
assert "kagi.com/api/v0/search" in url
assert kw["headers"]["Authorization"] == "Bot kagi-key"
assert kw["headers"]["User-Agent"] == "nanobot-search-test"
assert kw["params"] == {"q": "test", "limit": 2}
return _response(json={
"data": [
@ -150,7 +163,7 @@ async def test_kagi_search(monkeypatch):
})
monkeypatch.setattr(httpx.AsyncClient, "get", mock_get)
tool = _tool(provider="kagi", api_key="kagi-key")
tool = _tool(provider="kagi", api_key="kagi-key", user_agent="nanobot-search-test")
result = await tool.execute(query="test", count=2)
assert "Kagi Result" in result
assert "https://kagi.com" in result