From 5943ab386d8a47e29eea89baf6c6098762020505 Mon Sep 17 00:00:00 2001 From: hussein1362 Date: Sat, 25 Apr 2026 17:36:58 +0300 Subject: [PATCH] fix(providers): disable HTTP keepalive for local/LAN endpoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Local model servers (Ollama, llama.cpp, vLLM) often close idle HTTP connections before the client-side keepalive timer expires. When two LLM calls happen seconds apart — for example the heartbeat _decide() phase followed immediately by process_direct() — the second call grabs a now-dead pooled connection, causing a transient APIConnectionError on every first attempt. The fix detects local endpoints via: - ProviderSpec.is_local (Ollama, LM Studio, vLLM, OVMS) - Private-network URL patterns (localhost, 127.x, 192.168.x, 10.x, 172.16-31.x, host.docker.internal, [::1]) For these endpoints, the AsyncOpenAI client is created with a custom httpx.AsyncClient that sets keepalive_expiry=0, forcing a fresh TCP connection for each request. This is cheap on LAN (sub-5ms connect) and eliminates the stale-connection retry tax entirely. Cloud providers (OpenAI, Anthropic, OpenRouter, etc.) keep the default 5-second keepalive, which is fine for high-frequency API usage. The private-network heuristic also covers the common case where users configure provider='openai' but point apiBase at a LAN IP running llama.cpp — the spec says is_local=False, but the URL clearly is. 
--- nanobot/providers/openai_compat_provider.py | 50 ++++++++ .../test_local_endpoint_detection.py | 111 ++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 tests/providers/test_local_endpoint_detection.py diff --git a/nanobot/providers/openai_compat_provider.py b/nanobot/providers/openai_compat_provider.py index f603b9e37..c59080abc 100644 --- a/nanobot/providers/openai_compat_provider.py +++ b/nanobot/providers/openai_compat_provider.py @@ -14,6 +14,7 @@ import uuid from collections.abc import Awaitable, Callable from typing import TYPE_CHECKING, Any +import httpx import json_repair from loguru import logger @@ -159,6 +160,39 @@ _RESPONSES_FAILURE_THRESHOLD = 3 _RESPONSES_PROBE_INTERVAL_S = 300 # 5 minutes +def _is_local_endpoint( + spec: "ProviderSpec | None", + api_base: str | None, +) -> bool: + """Return True when the endpoint is a local or LAN model server. + + Matches either the provider spec's ``is_local`` flag or common private- + network patterns in the base URL (localhost, 127.x, 192.168.x, 10.x, + 172.16-31.x, Docker ``host.docker.internal``). + """ + if spec and spec.is_local: + return True + if not api_base: + return False + host = api_base.strip().lower().rstrip("/") + private_patterns = ( + "localhost", + "127.", + "192.168.", + "10.", + "host.docker.internal", + "[::1]", + ) + if any(p in host for p in private_patterns): + return True + # 172.16.0.0 – 172.31.255.255 + import re + m = re.search(r"172\.(\d+)\." , host) + if m and 16 <= int(m.group(1)) <= 31: + return True + return False + + def _is_direct_openai_base(api_base: str | None) -> bool: """Return True for direct OpenAI endpoints, not generic OpenAI-compatible gateways.""" if not api_base: @@ -208,11 +242,27 @@ class OpenAICompatProvider(LLMProvider): if extra_headers: default_headers.update(extra_headers) + # Local model servers (Ollama, llama.cpp, vLLM) often close idle + # HTTP connections before the client-side keepalive expires. 
def _make_spec(is_local: bool = False) -> MagicMock:
    """Build a stand-in provider spec whose ``is_local`` flag is preset."""
    return MagicMock(is_local=is_local)


class TestIsLocalEndpoint:
    """Exercise ``_is_local_endpoint`` against spec flags and base URLs."""

    def test_spec_is_local_true(self):
        flagged = _make_spec(is_local=True)
        assert _is_local_endpoint(flagged, None) is True

    def test_spec_is_local_false_no_base(self):
        unflagged = _make_spec(is_local=False)
        assert _is_local_endpoint(unflagged, None) is False

    def test_no_spec_no_base(self):
        assert _is_local_endpoint(None, None) is False

    def test_localhost(self):
        url = "http://localhost:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_localhost_https(self):
        url = "https://localhost:8080/v1"
        assert _is_local_endpoint(None, url) is True

    def test_loopback_127(self):
        url = "http://127.0.0.1:11434/v1"
        assert _is_local_endpoint(None, url) is True

    def test_private_192_168(self):
        url = "http://192.168.8.188:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_private_10(self):
        url = "http://10.0.0.5:8000/v1"
        assert _is_local_endpoint(None, url) is True

    def test_private_172_16(self):
        # Lower edge of the 172.16-31.x range.
        url = "http://172.16.0.1:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_private_172_31(self):
        # Upper edge of the 172.16-31.x range.
        url = "http://172.31.255.255:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_not_private_172_32(self):
        # First second-octet value past the 16-31 range.
        url = "http://172.32.0.1:1234/v1"
        assert _is_local_endpoint(None, url) is False

    def test_docker_internal(self):
        url = "http://host.docker.internal:11434/v1"
        assert _is_local_endpoint(None, url) is True

    def test_ipv6_loopback(self):
        url = "http://[::1]:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_public_api(self):
        url = "https://api.openai.com/v1"
        assert _is_local_endpoint(None, url) is False

    def test_openrouter(self):
        url = "https://openrouter.ai/api/v1"
        assert _is_local_endpoint(None, url) is False

    def test_spec_overrides_public_url(self):
        """spec.is_local=True takes precedence even with a public-looking URL."""
        flagged = _make_spec(is_local=True)
        assert _is_local_endpoint(flagged, "https://api.example.com/v1") is True

    def test_case_insensitive(self):
        url = "http://LOCALHOST:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_trailing_slash(self):
        url = "http://192.168.1.1:8080/v1/"
        assert _is_local_endpoint(None, url) is True


class TestLocalKeepaliveConfig:
    """Check the keepalive expiry configured on the provider's HTTP pool."""

    @staticmethod
    def _build_provider(*, is_local, default_api_base, api_base):
        """Create a provider from a stand-in spec with the given endpoints."""
        spec = _make_spec(is_local=is_local)
        spec.env_key = ""
        spec.default_api_base = default_api_base
        return OpenAICompatProvider(api_key="test", api_base=api_base, spec=spec)

    @staticmethod
    def _keepalive_expiry(provider):
        """Read the pool's keepalive expiry through private attributes.

        NOTE(review): this walks private attributes of the client stack and
        may break when those libraries are upgraded.
        """
        pool = provider._client._client._transport._pool
        return pool._keepalive_expiry

    def test_local_spec_disables_keepalive(self):
        provider = self._build_provider(
            is_local=True,
            default_api_base="http://localhost:11434/v1",
            api_base="http://localhost:11434/v1",
        )
        assert self._keepalive_expiry(provider) == 0

    def test_lan_ip_disables_keepalive(self):
        """A generic 'openai' spec with a LAN IP should still disable keepalive."""
        provider = self._build_provider(
            is_local=False,
            default_api_base=None,
            api_base="http://192.168.8.188:1234/v1",
        )
        assert self._keepalive_expiry(provider) == 0

    def test_cloud_keeps_default_keepalive(self):
        provider = self._build_provider(
            is_local=False,
            default_api_base="https://api.openai.com/v1",
            api_base=None,
        )
        # Default httpx keepalive is 5.0s
        assert self._keepalive_expiry(provider) == 5.0