fix(goal): re-evaluate LLM timeout per request when goal state changes mid-run

llm_timeout_s was computed once, when the AgentRunSpec was created, so if
long_task was called during a turn, subsequent LLM requests still used
the original timeout (default 300s). It is now a callable that re-reads
session metadata on each _call_llm invocation; a plain float or None is
still accepted.
This commit is contained in:
chengyongru 2026-05-16 20:20:37 +08:00
parent 387724c355
commit dfabd597f3
4 changed files with 11 additions and 13 deletions

View File

@@ -794,9 +794,8 @@ class AgentLoop:
retry_wait_callback=on_retry_wait,
checkpoint_callback=_checkpoint,
injection_callback=_drain_pending,
# Sustained goals may legitimately exceed NANOBOT_LLM_TIMEOUT_S; idle stall
# is still capped by NANOBOT_STREAM_IDLE_TIMEOUT_S in streaming providers.
llm_timeout_s=runner_wall_llm_timeout_s(
# Re-evaluated per LLM request so mid-run goal registration takes effect.
llm_timeout_s=lambda: runner_wall_llm_timeout_s(
self.sessions,
session.key if session is not None else session_key,
metadata=(session.metadata if session is not None else None),

View File

@@ -8,7 +8,7 @@ import os
from contextlib import suppress
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from typing import Any, Callable
from loguru import logger
@@ -81,7 +81,7 @@ class AgentRunSpec:
retry_wait_callback: Any | None = None
checkpoint_callback: Any | None = None
injection_callback: Any | None = None
llm_timeout_s: float | None = None
llm_timeout_s: float | None | Callable[[], float | None] = None
@dataclass(slots=True)
@@ -592,7 +592,7 @@ class AgentRunner:
hook: AgentHook,
context: AgentHookContext,
):
timeout_s: float | None = spec.llm_timeout_s
timeout_s: float | None = spec.llm_timeout_s() if callable(spec.llm_timeout_s) else spec.llm_timeout_s
if timeout_s is None:
# Default to a finite timeout to avoid per-session lock starvation when an LLM
# request hangs indefinitely (e.g. gateway/network stall).

View File

@@ -199,11 +199,9 @@ class SubagentManager:
]
sess_key = origin.get("session_key")
llm_timeout = (
self._llm_wall_timeout_for_session(sess_key)
if self._llm_wall_timeout_for_session
else None
)
def _llm_timeout() -> float | None:
return self._llm_wall_timeout_for_session(sess_key) if self._llm_wall_timeout_for_session else None
result = await self.runner.run(AgentRunSpec(
initial_messages=messages,
tools=tools,
@@ -216,7 +214,7 @@ class SubagentManager:
fail_on_tool_error=True,
checkpoint_callback=_on_checkpoint,
session_key=sess_key,
llm_timeout_s=llm_timeout,
llm_timeout_s=_llm_timeout,
))
status.phase = "done"
status.stop_reason = result.stop_reason

View File

@@ -43,4 +43,5 @@ async def test_subagent_forwards_resolver_to_agent_run_spec(tmp_path: Path) -> N
mgr.runner.run.assert_called_once()
spec = mgr.runner.run.call_args[0][0]
assert spec.session_key == "cli:direct"
assert spec.llm_timeout_s == 0.0
assert callable(spec.llm_timeout_s)
assert spec.llm_timeout_s() == 0.0