nanobot/tests/tools/test_web_fetch_url_sanitization.py
彭星杰 5dc96505e8 fix(web_fetch): sanitize URL to strip markdown backticks and quotes before validation
LLM-generated tool calls may wrap URLs in markdown backticks or quotes
(e.g. `https://example.com`), causing urlparse to produce empty scheme
and netloc, which leads to all fetch attempts failing silently.

Add URL cleaning at the top of WebFetchTool.execute to strip whitespace,
backticks, double quotes, and single quotes, plus an early rejection guard
for non-http(s) URLs after cleaning.
2026-05-01 19:58:19 +08:00

140 lines
4.1 KiB
Python

"""Tests for web_fetch URL sanitization (backtick/quote stripping)."""
from __future__ import annotations
import json
from unittest.mock import patch
import pytest
from nanobot.agent.tools.web import WebFetchTool, _validate_url
def _fake_resolve_public(hostname, port, family=0, type_=0):
import socket
return [(socket.AF_INET, socket.SOCK_STREAM, 0, "", ("93.184.216.34", 0))]
class FakeResponse:
    """Minimal stand-in for a successful httpx HTML response."""

    status_code = 200
    url = "https://example.com/page"
    text = "<html><head><title>T</title></head><body><p>ok</p></body></html>"
    headers = {"content-type": "text/html"}

    def raise_for_status(self):
        # Always a 200 — nothing to raise.
        return None

    def json(self):
        # Body is HTML; return an empty object if anyone asks for JSON.
        return {}
class FakeStreamResponse:
    """Async-context-manager stand-in for httpx's streaming response."""

    headers = {"content-type": "text/html"}
    url = "https://example.com/page"

    async def __aenter__(self):
        # Entering the stream context yields the response itself.
        return self

    async def __aexit__(self, *exc_info):
        # Never suppress exceptions raised inside the context.
        return False
class FakeClient:
    """Stand-in for httpx.AsyncClient: accepts any args, serves canned fakes."""

    def __init__(self, *args, **kwargs):
        # Ignore timeouts, limits, proxies — construction args are irrelevant.
        pass

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc_info):
        return False

    def stream(self, method, url, **kwargs):
        # httpx returns the stream context manager synchronously.
        return FakeStreamResponse()

    async def get(self, url, **kwargs):
        return FakeResponse()
def _patch_env():
    """Return (dns_patch, client_patch) patchers that route fetches to fakes.

    The first patcher pins DNS resolution to a public IP (defeats the SSRF
    guard); the second swaps httpx.AsyncClient for FakeClient.
    """
    dns_patch = patch(
        "nanobot.security.network.socket.getaddrinfo", _fake_resolve_public
    )
    client_patch = patch(
        "nanobot.agent.tools.web.httpx.AsyncClient", FakeClient
    )
    return dns_patch, client_patch
# --- urlparse / _validate_url level tests ---
@pytest.mark.parametrize("dirty_url", [
    "`https://example.com/page`",
    " `https://example.com/page` ",
    '"https://example.com/page"',
    "'https://example.com/page'",
    ' "https://example.com/page" ',
])
def test_dirty_urls_fail_validation(dirty_url):
    """URLs wrapped in backticks/quotes are rejected by raw _validate_url."""
    ok, _reason = _validate_url(dirty_url)
    assert not ok
def test_clean_url_passes_validation():
    """A plain https URL passes _validate_url unchanged."""
    ok, _reason = _validate_url("https://example.com/page")
    assert ok
def test_backtick_url_produces_empty_scheme_in_urlparse():
    """Document why cleaning matters: backticks make urlparse find nothing."""
    from urllib.parse import urlparse
    parts = urlparse("`https://example.com/page`")
    assert (parts.scheme, parts.netloc) == ("", "")
# --- WebFetchTool.execute integration tests ---
@pytest.mark.asyncio
async def test_execute_strips_backticks_and_succeeds():
    """A backtick-wrapped URL is cleaned by execute() and the fetch succeeds."""
    tool = WebFetchTool()
    # Call _patch_env() once and unpack: the original `_patch_env()[0],
    # _patch_env()[1]` built four patcher objects and discarded two.
    dns_patch, client_patch = _patch_env()
    with dns_patch, client_patch:
        result = await tool.execute(url="`https://example.com/page`")
    data = json.loads(result)
    assert "error" not in data, f"unexpected error: {data}"
@pytest.mark.asyncio
async def test_execute_strips_double_quotes_and_succeeds():
    """A double-quoted URL is cleaned by execute() and the fetch succeeds."""
    tool = WebFetchTool()
    # Call _patch_env() once and unpack: the original `_patch_env()[0],
    # _patch_env()[1]` built four patcher objects and discarded two.
    dns_patch, client_patch = _patch_env()
    with dns_patch, client_patch:
        result = await tool.execute(url='"https://example.com/page"')
    data = json.loads(result)
    assert "error" not in data, f"unexpected error: {data}"
@pytest.mark.asyncio
async def test_execute_strips_single_quotes_and_succeeds():
    """A single-quoted URL is cleaned by execute() and the fetch succeeds."""
    tool = WebFetchTool()
    # Call _patch_env() once and unpack: the original `_patch_env()[0],
    # _patch_env()[1]` built four patcher objects and discarded two.
    dns_patch, client_patch = _patch_env()
    with dns_patch, client_patch:
        result = await tool.execute(url="'https://example.com/page'")
    data = json.loads(result)
    assert "error" not in data, f"unexpected error: {data}"
@pytest.mark.asyncio
async def test_execute_strips_space_and_backticks():
    """Surrounding whitespace plus backticks are both stripped by execute()."""
    tool = WebFetchTool()
    # Call _patch_env() once and unpack: the original `_patch_env()[0],
    # _patch_env()[1]` built four patcher objects and discarded two.
    dns_patch, client_patch = _patch_env()
    with dns_patch, client_patch:
        result = await tool.execute(url=" `https://example.com/page` ")
    data = json.loads(result)
    assert "error" not in data, f"unexpected error: {data}"
# --- startswith guard tests ---
@pytest.mark.asyncio
async def test_execute_rejects_non_http_url_after_cleaning():
    """Non-http(s) schemes are rejected even though cleaning leaves them intact."""
    tool = WebFetchTool()
    payload = json.loads(await tool.execute(url="ftp://example.com/file"))
    assert "error" in payload
    assert "Invalid URL" in payload["error"]
@pytest.mark.asyncio
async def test_execute_rejects_garbage_after_cleaning():
    """Stripping backticks from non-URL text must not make it fetchable."""
    tool = WebFetchTool()
    payload = json.loads(await tool.execute(url="`not a url at all`"))
    assert "error" in payload
    assert "Invalid URL" in payload["error"]
@pytest.mark.asyncio
async def test_execute_rejects_bare_domain_after_cleaning():
    """A scheme-less domain still fails the http(s) guard after cleaning."""
    tool = WebFetchTool()
    payload = json.loads(await tool.execute(url="`example.com/page`"))
    assert "error" in payload
    assert "Invalid URL" in payload["error"]