fix(goal): re-evaluate LLM timeout per request when goal state changes mid-run

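Previously, llm_timeout_s was computed once when the AgentRunSpec was created, so if long_task was called during a turn, subsequent LLM requests in that run still used the original timeout (default 300s). The field now also accepts a callable (plain float/None values still work); both call sites pass one, so the timeout is re-evaluated and session metadata is re-read on each _call_llm invocation.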
2026-05-19 16:12:30 +00:00 · 2026-05-16 20:20:37 +08:00 · 2026-05-16 20:20:37 +08:00 · dfabd597f3
commit dfabd597f3
parent 387724c355
4 changed files with 11 additions and 13 deletions
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@ -794,9 +794,8 @@ class AgentLoop:
                retry_wait_callback=on_retry_wait,
                checkpoint_callback=_checkpoint,
                injection_callback=_drain_pending,
-                # Sustained goals may legitimately exceed NANOBOT_LLM_TIMEOUT_S; idle stall
-                # is still capped by NANOBOT_STREAM_IDLE_TIMEOUT_S in streaming providers.
-                llm_timeout_s=runner_wall_llm_timeout_s(
+                # Re-evaluated per LLM request so mid-run goal registration takes effect.
+                llm_timeout_s=lambda: runner_wall_llm_timeout_s(
                    self.sessions,
                    session.key if session is not None else session_key,
                    metadata=(session.metadata if session is not None else None),
--- a/nanobot/agent/runner.py
+++ b/nanobot/agent/runner.py
@ -8,7 +8,7 @@ import os
 from contextlib import suppress
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable

 from loguru import logger

@ -81,7 +81,7 @@ class AgentRunSpec:
    retry_wait_callback: Any | None = None
    checkpoint_callback: Any | None = None
    injection_callback: Any | None = None
-    llm_timeout_s: float | None = None
+    llm_timeout_s: float | None | Callable[[], float | None] = None


@dataclass(slots=True)
@ -592,7 +592,7 @@ class AgentRunner:
        hook: AgentHook,
        context: AgentHookContext,
    ):
-        timeout_s: float | None = spec.llm_timeout_s
+        timeout_s: float | None = spec.llm_timeout_s() if callable(spec.llm_timeout_s) else spec.llm_timeout_s
        if timeout_s is None:
            # Default to a finite timeout to avoid per-session lock starvation when an LLM
            # request hangs indefinitely (e.g. gateway/network stall).
--- a/nanobot/agent/subagent.py
+++ b/nanobot/agent/subagent.py
@ -199,11 +199,9 @@ class SubagentManager:
            ]

            sess_key = origin.get("session_key")
-            llm_timeout = (
-                self._llm_wall_timeout_for_session(sess_key)
-                if self._llm_wall_timeout_for_session
-                else None
-            )
+
+            def _llm_timeout() -> float | None:
+                return self._llm_wall_timeout_for_session(sess_key) if self._llm_wall_timeout_for_session else None
            result = await self.runner.run(AgentRunSpec(
                initial_messages=messages,
                tools=tools,
@ -216,7 +214,7 @@ class SubagentManager:
                fail_on_tool_error=True,
                checkpoint_callback=_on_checkpoint,
                session_key=sess_key,
-                llm_timeout_s=llm_timeout,
+                llm_timeout_s=_llm_timeout,
            ))
            status.phase = "done"
            status.stop_reason = result.stop_reason
--- a/tests/agent/test_loop_goal_wall_timeout.py
+++ b/tests/agent/test_loop_goal_wall_timeout.py
@ -43,4 +43,5 @@ async def test_subagent_forwards_resolver_to_agent_run_spec(tmp_path: Path) -> N
    mgr.runner.run.assert_called_once()
    spec = mgr.runner.run.call_args[0][0]
    assert spec.session_key == "cli:direct"
-    assert spec.llm_timeout_s == 0.0
+    assert callable(spec.llm_timeout_s)
+    assert spec.llm_timeout_s() == 0.0