mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-04 08:45:54 +00:00
LLM-generated tool calls may wrap URLs in markdown backticks or quotes (e.g. `https://example.com`), causing urlparse to produce an empty scheme and netloc, which makes every fetch attempt fail silently. Add URL cleaning at the top of WebFetchTool.execute to strip whitespace, backticks, double quotes, and single quotes, plus an early rejection guard for non-http(s) URLs after cleaning.
140 lines
4.1 KiB
Python
140 lines
4.1 KiB
Python
"""Tests for web_fetch URL sanitization (backtick/quote stripping)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from nanobot.agent.tools.web import WebFetchTool, _validate_url
|
|
|
|
|
|
def _fake_resolve_public(hostname, port, family=0, type_=0):
    """Stand-in for ``socket.getaddrinfo`` that always yields one fixed IPv4 record.

    Every argument is accepted only for signature compatibility and is ignored.

    Returns:
        A one-element list holding a ``getaddrinfo``-style 5-tuple whose
        socket address is ``("93.184.216.34", 0)``.
    """
    import socket

    fixed_sockaddr = ("93.184.216.34", 0)
    record = (socket.AF_INET, socket.SOCK_STREAM, 0, "", fixed_sockaddr)
    return [record]
|
|
|
|
|
|
class FakeResponse:
    """Canned successful HTML response handed back by ``FakeClient.get``."""

    # Fixed values shared by every instance: a 200 status and a tiny HTML page.
    status_code = 200
    url = "https://example.com/page"
    text = "<html><head><title>T</title></head><body><p>ok</p></body></html>"
    headers = {"content-type": "text/html"}

    def raise_for_status(self):
        """Do nothing; this fake never raises for its status."""
        return None

    def json(self):
        """Return an empty dict as the decoded JSON body."""
        return dict()
|
|
|
|
|
|
class FakeStreamResponse:
    """Canned streaming response that can be used as an async context manager."""

    # Fixed values shared by every instance.
    headers = {"content-type": "text/html"}
    url = "https://example.com/page"

    async def __aenter__(self):
        """Enter the async context and hand back this same object."""
        return self

    async def __aexit__(self, *exc_info):
        """Leave the async context; returning False lets any exception propagate."""
        return False
|
|
|
|
|
|
class FakeClient:
    """Network-free replacement patched in for ``httpx.AsyncClient`` in these tests."""

    def __init__(self, *args, **kwargs):
        """Accept and discard whatever constructor arguments are supplied."""

    async def __aenter__(self):
        """Enter the async context and hand back this same client."""
        return self

    async def __aexit__(self, *exc_info):
        """Leave the async context; returning False lets any exception propagate."""
        return False

    def stream(self, method, url, **kwargs):
        """Return a new ``FakeStreamResponse`` regardless of the request details."""
        canned_stream = FakeStreamResponse()
        return canned_stream

    async def get(self, url, **kwargs):
        """Return a new ``FakeResponse`` regardless of the request details."""
        canned_response = FakeResponse()
        return canned_response
|
|
|
|
|
|
def _patch_env():
    """Build the two patches the integration tests run under.

    Returns:
        A 2-tuple ``(resolver_patch, client_patch)``. The first replaces
        ``nanobot.security.network.socket.getaddrinfo`` with
        ``_fake_resolve_public``; the second replaces
        ``nanobot.agent.tools.web.httpx.AsyncClient`` with ``FakeClient``.
        Neither patch is started here; callers enter them as context managers.
    """
    resolver_patch = patch(
        "nanobot.security.network.socket.getaddrinfo", _fake_resolve_public
    )
    client_patch = patch("nanobot.agent.tools.web.httpx.AsyncClient", FakeClient)
    return resolver_patch, client_patch
|
|
|
|
|
|
# --- urlparse / _validate_url level tests ---
|
|
|
|
@pytest.mark.parametrize(
    "dirty_url",
    [
        "`https://example.com/page`",
        " `https://example.com/page` ",
        '"https://example.com/page"',
        "'https://example.com/page'",
        ' "https://example.com/page" ',
    ],
)
def test_dirty_urls_fail_validation(dirty_url):
    """``_validate_url`` rejects URLs still wrapped in backticks, quotes, or spaces."""
    accepted, _reason = _validate_url(dirty_url)
    assert not accepted
|
|
|
|
|
|
def test_clean_url_passes_validation():
    """An unwrapped https URL is accepted by ``_validate_url``."""
    accepted, _reason = _validate_url("https://example.com/page")
    assert accepted
|
|
|
|
|
|
def test_backtick_url_produces_empty_scheme_in_urlparse():
    """Pin the stdlib behaviour: a backtick-wrapped URL parses with no scheme or netloc."""
    import urllib.parse

    parts = urllib.parse.urlparse("`https://example.com/page`")
    assert parts.scheme == ""
    assert parts.netloc == ""
|
|
|
|
|
|
# --- WebFetchTool.execute integration tests ---
|
|
|
|
@pytest.mark.asyncio
async def test_execute_strips_backticks_and_succeeds():
    """``execute`` succeeds when the URL argument is wrapped in backticks."""
    tool = WebFetchTool()
    # _patch_env() builds both patches at once, so unpack a single call
    # instead of constructing the pair twice and indexing each result.
    resolver_patch, client_patch = _patch_env()
    with resolver_patch, client_patch:
        result = await tool.execute(url="`https://example.com/page`")
    # Parsing and asserting need no patched network, so do it outside.
    data = json.loads(result)
    assert "error" not in data, f"unexpected error: {data}"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_execute_strips_double_quotes_and_succeeds():
    """``execute`` succeeds when the URL argument is wrapped in double quotes."""
    tool = WebFetchTool()
    # _patch_env() builds both patches at once, so unpack a single call
    # instead of constructing the pair twice and indexing each result.
    resolver_patch, client_patch = _patch_env()
    with resolver_patch, client_patch:
        result = await tool.execute(url='"https://example.com/page"')
    # Parsing and asserting need no patched network, so do it outside.
    data = json.loads(result)
    assert "error" not in data, f"unexpected error: {data}"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_execute_strips_single_quotes_and_succeeds():
    """``execute`` succeeds when the URL argument is wrapped in single quotes."""
    tool = WebFetchTool()
    # _patch_env() builds both patches at once, so unpack a single call
    # instead of constructing the pair twice and indexing each result.
    resolver_patch, client_patch = _patch_env()
    with resolver_patch, client_patch:
        result = await tool.execute(url="'https://example.com/page'")
    # Parsing and asserting need no patched network, so do it outside.
    data = json.loads(result)
    assert "error" not in data, f"unexpected error: {data}"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_execute_strips_space_and_backticks():
    """``execute`` succeeds when the URL is wrapped in backticks and padded with spaces."""
    tool = WebFetchTool()
    # _patch_env() builds both patches at once, so unpack a single call
    # instead of constructing the pair twice and indexing each result.
    resolver_patch, client_patch = _patch_env()
    with resolver_patch, client_patch:
        result = await tool.execute(url=" `https://example.com/page` ")
    # Parsing and asserting need no patched network, so do it outside.
    data = json.loads(result)
    assert "error" not in data, f"unexpected error: {data}"
|
|
|
|
|
|
# --- startswith guard tests ---
|
|
|
|
@pytest.mark.asyncio
async def test_execute_rejects_non_http_url_after_cleaning():
    """An ``ftp://`` URL yields an error payload mentioning "Invalid URL"."""
    fetcher = WebFetchTool()
    raw = await fetcher.execute(url="ftp://example.com/file")
    payload = json.loads(raw)
    assert "error" in payload
    assert "Invalid URL" in payload["error"]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_execute_rejects_garbage_after_cleaning():
    """Backtick-wrapped text that is not a URL yields an "Invalid URL" error payload."""
    fetcher = WebFetchTool()
    raw = await fetcher.execute(url="`not a url at all`")
    payload = json.loads(raw)
    assert "error" in payload
    assert "Invalid URL" in payload["error"]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_execute_rejects_bare_domain_after_cleaning():
    """A backtick-wrapped domain with no scheme yields an "Invalid URL" error payload."""
    fetcher = WebFetchTool()
    raw = await fetcher.execute(url="`example.com/page`")
    payload = json.loads(raw)
    assert "error" in payload
    assert "Invalid URL" in payload["error"]
|