fix(goal): re-evaluate LLM timeout per request when goal state changes mid-run

llm_timeout_s was computed once, when the AgentRunSpec was created, so
if long_task was called during a turn, subsequent LLM requests in that
run still used the original timeout (default 300s). The field may now
also be a callable; when it is, it is invoked on each _call_llm
invocation so the timeout is re-read from session metadata per request.
This commit is contained in:
chengyongru 2026-05-16 20:20:37 +08:00
parent 387724c355
commit dfabd597f3
4 changed files with 11 additions and 13 deletions

View File

@ -794,9 +794,8 @@ class AgentLoop:
retry_wait_callback=on_retry_wait, retry_wait_callback=on_retry_wait,
checkpoint_callback=_checkpoint, checkpoint_callback=_checkpoint,
injection_callback=_drain_pending, injection_callback=_drain_pending,
# Sustained goals may legitimately exceed NANOBOT_LLM_TIMEOUT_S; idle stall # Re-evaluated per LLM request so mid-run goal registration takes effect.
# is still capped by NANOBOT_STREAM_IDLE_TIMEOUT_S in streaming providers. llm_timeout_s=lambda: runner_wall_llm_timeout_s(
llm_timeout_s=runner_wall_llm_timeout_s(
self.sessions, self.sessions,
session.key if session is not None else session_key, session.key if session is not None else session_key,
metadata=(session.metadata if session is not None else None), metadata=(session.metadata if session is not None else None),

View File

@ -8,7 +8,7 @@ import os
from contextlib import suppress from contextlib import suppress
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any, Callable
from loguru import logger from loguru import logger
@ -81,7 +81,7 @@ class AgentRunSpec:
retry_wait_callback: Any | None = None retry_wait_callback: Any | None = None
checkpoint_callback: Any | None = None checkpoint_callback: Any | None = None
injection_callback: Any | None = None injection_callback: Any | None = None
llm_timeout_s: float | None = None llm_timeout_s: float | None | Callable[[], float | None] = None
@dataclass(slots=True) @dataclass(slots=True)
@ -592,7 +592,7 @@ class AgentRunner:
hook: AgentHook, hook: AgentHook,
context: AgentHookContext, context: AgentHookContext,
): ):
timeout_s: float | None = spec.llm_timeout_s timeout_s: float | None = spec.llm_timeout_s() if callable(spec.llm_timeout_s) else spec.llm_timeout_s
if timeout_s is None: if timeout_s is None:
# Default to a finite timeout to avoid per-session lock starvation when an LLM # Default to a finite timeout to avoid per-session lock starvation when an LLM
# request hangs indefinitely (e.g. gateway/network stall). # request hangs indefinitely (e.g. gateway/network stall).

View File

@ -199,11 +199,9 @@ class SubagentManager:
] ]
sess_key = origin.get("session_key") sess_key = origin.get("session_key")
llm_timeout = (
self._llm_wall_timeout_for_session(sess_key) def _llm_timeout() -> float | None:
if self._llm_wall_timeout_for_session return self._llm_wall_timeout_for_session(sess_key) if self._llm_wall_timeout_for_session else None
else None
)
result = await self.runner.run(AgentRunSpec( result = await self.runner.run(AgentRunSpec(
initial_messages=messages, initial_messages=messages,
tools=tools, tools=tools,
@ -216,7 +214,7 @@ class SubagentManager:
fail_on_tool_error=True, fail_on_tool_error=True,
checkpoint_callback=_on_checkpoint, checkpoint_callback=_on_checkpoint,
session_key=sess_key, session_key=sess_key,
llm_timeout_s=llm_timeout, llm_timeout_s=_llm_timeout,
)) ))
status.phase = "done" status.phase = "done"
status.stop_reason = result.stop_reason status.stop_reason = result.stop_reason

View File

@ -43,4 +43,5 @@ async def test_subagent_forwards_resolver_to_agent_run_spec(tmp_path: Path) -> N
mgr.runner.run.assert_called_once() mgr.runner.run.assert_called_once()
spec = mgr.runner.run.call_args[0][0] spec = mgr.runner.run.call_args[0][0]
assert spec.session_key == "cli:direct" assert spec.session_key == "cli:direct"
assert spec.llm_timeout_s == 0.0 assert callable(spec.llm_timeout_s)
assert spec.llm_timeout_s() == 0.0