From 545294c62c0947da40eb5b65288aaf02b5fdf632 Mon Sep 17 00:00:00 2001 From: Xubin Ren <52506698+Re-bin@users.noreply.github.com> Date: Fri, 22 May 2026 23:02:15 +0800 Subject: [PATCH] fix(web): keep safe fetch preflight streaming --- nanobot/agent/tools/web.py | 60 ++++++++++++++++++++--- tests/tools/test_web_fetch_security.py | 68 +++++++++++++++++++++++++- 2 files changed, 119 insertions(+), 9 deletions(-) diff --git a/nanobot/agent/tools/web.py b/nanobot/agent/tools/web.py index 71b1011d1..dc87ea6a2 100644 --- a/nanobot/agent/tools/web.py +++ b/nanobot/agent/tools/web.py @@ -114,6 +114,46 @@ async def _get_with_safe_redirects( return None, f"Too many redirects: exceeded limit of {MAX_REDIRECTS}" + +async def _stream_with_safe_redirects( + client: httpx.AsyncClient, + url: str, + headers: dict[str, str] | None = None, +) -> tuple[httpx.Response | None, Any | None, str | None]: + """Open a streamed response while validating every redirect target first.""" + current_url = url + for _ in range(MAX_REDIRECTS + 1): + is_valid, error_msg = _validate_url_safe(current_url) + if not is_valid: + return None, None, f"Redirect blocked: {error_msg}" + + stream = client.stream( + "GET", + current_url, + headers=headers, + follow_redirects=False, + ) + response = await stream.__aenter__() + is_redirect = 300 <= response.status_code < 400 + if not is_redirect: + return response, stream, None + + location = response.headers.get("location") + if not location: + return response, stream, None + + next_url = urljoin(str(response.url), location) + is_valid, error_msg = _validate_url_safe(next_url) + if not is_valid: + await stream.__aexit__(None, None, None) + return None, None, f"Redirect blocked: {error_msg}" + + await stream.__aexit__(None, None, None) + current_url = next_url + + return None, None, f"Too many redirects: exceeded limit of {MAX_REDIRECTS}" + + def _format_results(query: str, items: list[dict[str, Any]], n: int) -> str: """Format provider results into shared plaintext output.""" if not items: @@ -522,7 +562,7 @@ class WebFetchTool(Tool): # Detect and fetch images directly to avoid Jina's textual image captioning try: async with httpx.AsyncClient(proxy=self.proxy, timeout=15.0) as client: - r, redirect_error = await _get_with_safe_redirects( + r, stream, redirect_error = await _stream_with_safe_redirects( client, url, headers={"User-Agent": self.user_agent}, @@ -532,11 +572,15 @@ class WebFetchTool(Tool): if r is None: return json.dumps({"error": "Fetch failed", "url": url}, ensure_ascii=False) - ctype = r.headers.get("content-type", "") - if ctype.startswith("image/"): - r.raise_for_status() - raw = r.content - return build_image_content_blocks(raw, ctype, url, f"(Image fetched from: {url})") + try: + ctype = r.headers.get("content-type", "") + if ctype.startswith("image/"): + r.raise_for_status() + raw = await r.aread() + return build_image_content_blocks(raw, ctype, url, f"(Image fetched from: {url})") + finally: + if stream is not None: + await stream.__aexit__(None, None, None) except Exception as e: logger.debug("Pre-fetch image detection failed for {}: {}", url, e) @@ -585,8 +629,6 @@ class WebFetchTool(Tool): async def _fetch_readability(self, url: str, extract_mode: str, max_chars: int) -> Any: """Local fallback using readability-lxml.""" - from readability import Document - try: async with httpx.AsyncClient( timeout=30.0, @@ -610,6 +652,8 @@ class WebFetchTool(Tool): if "application/json" in ctype: text, extractor = json.dumps(r.json(), indent=2, ensure_ascii=False), "json" elif "text/html" in ctype or r.text[:256].lower().startswith(("