fix(providers): disable HTTP keepalive for local/LAN endpoints

Local model servers (Ollama, llama.cpp, vLLM) often close idle HTTP connections before the client-side keepalive timer expires. When two LLM calls happen seconds apart — for example the heartbeat _decide() phase followed immediately by process_direct() — the second call grabs a now-dead pooled connection, causing a transient APIConnectionError on every first attempt.

The fix detects local endpoints via:
- ProviderSpec.is_local (Ollama, LM Studio, vLLM, OVMS)
- Private-network URL patterns (localhost, 127.x, 192.168.x, 10.x, 172.16-31.x, host.docker.internal, [::1])

For these endpoints, the AsyncOpenAI client is created with a custom httpx.AsyncClient that sets keepalive_expiry=0, forcing a fresh TCP connection for each request. This is cheap on LAN (sub-5ms connect) and eliminates the stale-connection retry tax entirely. Cloud providers (OpenAI, Anthropic, OpenRouter, etc.) keep the default 5-second keepalive, which is fine for high-frequency API usage.

The private-network heuristic also covers the common case where users configure provider='openai' but point apiBase at a LAN IP running llama.cpp — the spec says is_local=False, but the URL clearly points at a local machine.
2026-05-26 11:32:25 +00:00 · 2026-04-25 17:36:58 +03:00 · 2026-04-25 17:36:58 +03:00 · 5943ab386d
commit 5943ab386d
parent d0e1b1393a
2 changed files with 161 additions and 0 deletions
--- a/nanobot/providers/openai_compat_provider.py
+++ b/nanobot/providers/openai_compat_provider.py
@ -14,6 +14,7 @@ import uuid
 from collections.abc import Awaitable, Callable
 from typing import TYPE_CHECKING, Any
 import httpx
 import json_repair
 from loguru import logger
@ -159,6 +160,39 @@ _RESPONSES_FAILURE_THRESHOLD = 3
 _RESPONSES_PROBE_INTERVAL_S = 300  # 5 minutes
 def _is_local_endpoint(
    spec: "ProviderSpec | None",
    api_base: str | None,
 ) -> bool:
    """Return True when the endpoint is a local or LAN model server.
    Matches either the provider spec's ``is_local`` flag or common private-
    network patterns in the base URL (localhost, 127.x, 192.168.x, 10.x,
    172.16-31.x, Docker ``host.docker.internal``).
    """
    if spec and spec.is_local:
        return True
    if not api_base:
        return False
    host = api_base.strip().lower().rstrip("/")
    private_patterns = (
        "localhost",
        "127.",
        "192.168.",
        "10.",
        "host.docker.internal",
        "[::1]",
    )
    if any(p in host for p in private_patterns):
        return True
    # 172.16.0.0 – 172.31.255.255
    import re
    m = re.search(r"172\.(\d+)\." , host)
    if m and 16 <= int(m.group(1)) <= 31:
        return True
    return False
 def _is_direct_openai_base(api_base: str | None) -> bool:
    """Return True for direct OpenAI endpoints, not generic OpenAI-compatible gateways."""
    if not api_base:
@ -208,11 +242,27 @@ class OpenAICompatProvider(LLMProvider):
        if extra_headers:
            default_headers.update(extra_headers)
        # Local model servers (Ollama, llama.cpp, vLLM) often close idle
        # HTTP connections before the client-side keepalive expires.  When
        # two LLM calls happen seconds apart (e.g. heartbeat _decide then
        # process_direct), the second call may grab a now-dead pooled
        # connection, causing a transient APIConnectionError on every first
        # attempt.  Disabling keepalive for local endpoints avoids this by
        # opening a fresh connection for each request, which is cheap on a
        # LAN.  Cloud providers benefit from keepalive, so we leave the
        # default pool settings for them.
        http_client: httpx.AsyncClient | None = None
        if _is_local_endpoint(spec, effective_base):
            http_client = httpx.AsyncClient(
                limits=httpx.Limits(keepalive_expiry=0),
            )
        self._client = AsyncOpenAI(
            api_key=api_key or "no-key",
            base_url=effective_base,
            default_headers=default_headers,
            max_retries=0,
            http_client=http_client,
        )
        # Responses API circuit breaker: skip after repeated failures,
--- a/tests/providers/test_local_endpoint_detection.py
+++ b/tests/providers/test_local_endpoint_detection.py
@ -0,0 +1,111 @@
 """Tests for _is_local_endpoint detection and keepalive configuration."""
 from unittest.mock import MagicMock
 import pytest
 from nanobot.providers.openai_compat_provider import (
    OpenAICompatProvider,
    _is_local_endpoint,
 )
 def _make_spec(is_local: bool = False) -> MagicMock:
    spec = MagicMock()
    spec.is_local = is_local
    return spec
 class TestIsLocalEndpoint:
    """Exercise ``_is_local_endpoint`` across spec flags and base-URL shapes."""

    def test_spec_is_local_true(self):
        """A spec flagged local is sufficient, even without a base URL."""
        local_spec = _make_spec(is_local=True)
        assert _is_local_endpoint(local_spec, None) is True

    def test_spec_is_local_false_no_base(self):
        """A non-local spec with no base URL is not local."""
        remote_spec = _make_spec(is_local=False)
        assert _is_local_endpoint(remote_spec, None) is False

    def test_no_spec_no_base(self):
        """With neither spec nor base URL the answer is False."""
        outcome = _is_local_endpoint(None, None)
        assert outcome is False

    def test_localhost(self):
        """Plain-HTTP ``localhost`` is local."""
        url = "http://localhost:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_localhost_https(self):
        """HTTPS ``localhost`` is local too."""
        url = "https://localhost:8080/v1"
        assert _is_local_endpoint(None, url) is True

    def test_loopback_127(self):
        """The IPv4 loopback address is local."""
        url = "http://127.0.0.1:11434/v1"
        assert _is_local_endpoint(None, url) is True

    def test_private_192_168(self):
        """Addresses under 192.168.x are local."""
        url = "http://192.168.8.188:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_private_10(self):
        """Addresses under 10.x are local."""
        url = "http://10.0.0.5:8000/v1"
        assert _is_local_endpoint(None, url) is True

    def test_private_172_16(self):
        """Lower edge of the 172.16-31 range is local."""
        url = "http://172.16.0.1:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_private_172_31(self):
        """Upper edge of the 172.16-31 range is local."""
        url = "http://172.31.255.255:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_not_private_172_32(self):
        """172.32.x lies just outside the private range."""
        url = "http://172.32.0.1:1234/v1"
        assert _is_local_endpoint(None, url) is False

    def test_docker_internal(self):
        """Docker's host alias is local."""
        url = "http://host.docker.internal:11434/v1"
        assert _is_local_endpoint(None, url) is True

    def test_ipv6_loopback(self):
        """The bracketed IPv6 loopback is local."""
        url = "http://[::1]:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_public_api(self):
        """The public OpenAI endpoint is not local."""
        url = "https://api.openai.com/v1"
        assert _is_local_endpoint(None, url) is False

    def test_openrouter(self):
        """The public OpenRouter endpoint is not local."""
        url = "https://openrouter.ai/api/v1"
        assert _is_local_endpoint(None, url) is False

    def test_spec_overrides_public_url(self):
        """spec.is_local=True takes precedence even with a public-looking URL."""
        local_spec = _make_spec(is_local=True)
        url = "https://api.example.com/v1"
        assert _is_local_endpoint(local_spec, url) is True

    def test_case_insensitive(self):
        """Upper-case host names are still recognised."""
        url = "http://LOCALHOST:1234/v1"
        assert _is_local_endpoint(None, url) is True

    def test_trailing_slash(self):
        """A trailing slash on the base URL does not affect detection."""
        url = "http://192.168.1.1:8080/v1/"
        assert _is_local_endpoint(None, url) is True
 class TestLocalKeepaliveConfig:
    """Check the keepalive expiry configured on each provider's connection pool."""

    @staticmethod
    def _keepalive_expiry_for(*, is_local, default_api_base, api_base):
        """Construct a provider from a mock spec and return its pool's keepalive expiry."""
        spec = _make_spec(is_local=is_local)
        spec.env_key = ""
        spec.default_api_base = default_api_base
        provider = OpenAICompatProvider(
            api_key="test",
            api_base=api_base,
            spec=spec,
        )
        # Walks private attributes of the provider's client; presumably the
        # underlying httpx client, its transport and the connection pool —
        # may need updating if those libraries change their internals.
        pool = provider._client._client._transport._pool
        return pool._keepalive_expiry

    def test_local_spec_disables_keepalive(self):
        """A spec flagged local yields a pool with keepalive expiry 0."""
        expiry = self._keepalive_expiry_for(
            is_local=True,
            default_api_base="http://localhost:11434/v1",
            api_base="http://localhost:11434/v1",
        )
        assert expiry == 0

    def test_lan_ip_disables_keepalive(self):
        """A generic 'openai' spec with a LAN IP should still disable keepalive."""
        expiry = self._keepalive_expiry_for(
            is_local=False,
            default_api_base=None,
            api_base="http://192.168.8.188:1234/v1",
        )
        assert expiry == 0

    def test_cloud_keeps_default_keepalive(self):
        """A cloud default base keeps the stock keepalive expiry."""
        expiry = self._keepalive_expiry_for(
            is_local=False,
            default_api_base="https://api.openai.com/v1",
            api_base=None,
        )
        # Default httpx keepalive is 5.0s
        assert expiry == 5.0