mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-20 16:42:25 +00:00
fix(weixin): treat ret=-2 as rate limit with 60s backoff
Reference wxclawbot-cli docs: ret=-2 is a rate limit (~7 msgs / 5 min per bot), NOT a stale session signal. Empty/missing errmsg is the normal rate-limit response; only 'unknown error' correlates with stale session per hermes-agent. Changes: - _is_stale_session_ret: only match 'unknown error', not empty errmsg - _send_text/_send_media_file: on ret=-2 wait 60s then retry once instead of retrying without context_token - Remove stale-session retry for empty errmsg (was burning quota) - Update tests to cover rate-limit backoff path
This commit is contained in:
parent
9665a0bb1a
commit
2e56fb95b6
@ -83,31 +83,26 @@ SESSION_PAUSE_DURATION_S = 60 * 60
|
|||||||
# iLink rate-limit / stale-session errcode
|
# iLink rate-limit / stale-session errcode
|
||||||
RATE_LIMIT_ERRCODE = -2
|
RATE_LIMIT_ERRCODE = -2
|
||||||
|
|
||||||
|
# iLink rate-limit backoff (wxclawbot-cli docs: ~7 msgs / 5 min per bot)
|
||||||
|
RATE_LIMIT_BACKOFF_S = 60
|
||||||
|
|
||||||
|
|
||||||
def _is_stale_session_ret(
|
def _is_stale_session_ret(
|
||||||
ret: int | None,
|
ret: int | None,
|
||||||
errcode: int | None,
|
errcode: int | None,
|
||||||
errmsg: str | None,
|
errmsg: str | None,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""True when iLink returns ret=-2 / errcode=-2 that is likely a stale
|
"""True when iLink returns ret=-2 / errcode=-2 with 'unknown error',
|
||||||
context_token rather than a genuine rate limit.
|
which historically correlates with a stale context_token.
|
||||||
|
|
||||||
Empirically iLink signals these two scenarios weakly:
|
Note: per wxclawbot-cli docs, ret=-2 is primarily a *rate limit*
|
||||||
- stale session: ret=-2, errmsg="unknown error" OR errmsg empty/None
|
(~7 msgs / 5 min per bot). We only treat the 'unknown error' variant
|
||||||
- genuine rate limit: ret=-2 with a populated errmsg such as
|
as stale-session because empty/missing errmsg is far more commonly
|
||||||
"frequency limit" / "too frequently" / similar
|
the rate-limit signal in practice.
|
||||||
|
|
||||||
Treating "unknown error" and empty/None errmsg as stale-session signals
|
|
||||||
lets the caller attempt one tokenless retry. A true rate limit still
|
|
||||||
falls through to the existing retry/backoff path if the tokenless
|
|
||||||
attempt also fails.
|
|
||||||
"""
|
"""
|
||||||
if ret != RATE_LIMIT_ERRCODE and errcode != RATE_LIMIT_ERRCODE:
|
if ret != RATE_LIMIT_ERRCODE and errcode != RATE_LIMIT_ERRCODE:
|
||||||
return False
|
return False
|
||||||
msg = (errmsg or "").strip().lower()
|
return (errmsg or "").strip().lower() == "unknown error"
|
||||||
if not msg:
|
|
||||||
return True
|
|
||||||
return msg == "unknown error"
|
|
||||||
|
|
||||||
|
|
||||||
# Retry constants (matching the reference plugin's monitor.ts)
|
# Retry constants (matching the reference plugin's monitor.ts)
|
||||||
@ -1176,20 +1171,16 @@ class WeixinChannel(BaseChannel):
|
|||||||
"base_info": BASE_INFO,
|
"base_info": BASE_INFO,
|
||||||
}
|
}
|
||||||
|
|
||||||
data = await self._api_post("ilink/bot/sendmessage", body)
|
async def _do_send(_body: dict[str, Any]) -> dict:
|
||||||
|
return await self._api_post("ilink/bot/sendmessage", _body)
|
||||||
|
|
||||||
|
data = await _do_send(body)
|
||||||
ret = data.get("ret", 0)
|
ret = data.get("ret", 0)
|
||||||
errcode = data.get("errcode", 0)
|
errcode = data.get("errcode", 0)
|
||||||
errmsg = data.get("errmsg", "")
|
errmsg = data.get("errmsg", "")
|
||||||
|
|
||||||
# The iLink sendmessage API may return ret=-2 / errcode=-2 for two
|
# Stale session (errmsg == "unknown error") — retry once without token.
|
||||||
# different reasons:
|
# This is distinct from the far more common rate-limit signal.
|
||||||
# - stale context_token: errmsg is empty/None or "unknown error"
|
|
||||||
# - genuine rate limit: errmsg is populated (e.g. "frequency limit")
|
|
||||||
# Per hermes-agent#17228 / #18100, the empty/None variant is a stale
|
|
||||||
# session signal. Retry once without context_token (iLink accepts
|
|
||||||
# tokenless sends as a degraded fallback). If the tokenless attempt
|
|
||||||
# also fails, let _check_response_error raise so ChannelManager can
|
|
||||||
# retry with backoff — do NOT swallow the error.
|
|
||||||
if _is_stale_session_ret(ret, errcode, errmsg) and context_token:
|
if _is_stale_session_ret(ret, errcode, errmsg) and context_token:
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
"WeChat send text returned stale-session signal for {} (client_id={}); "
|
"WeChat send text returned stale-session signal for {} (client_id={}); "
|
||||||
@ -1199,7 +1190,7 @@ class WeixinChannel(BaseChannel):
|
|||||||
)
|
)
|
||||||
body_no_ctx = copy.deepcopy(body)
|
body_no_ctx = copy.deepcopy(body)
|
||||||
body_no_ctx["msg"].pop("context_token", None)
|
body_no_ctx["msg"].pop("context_token", None)
|
||||||
data = await self._api_post("ilink/bot/sendmessage", body_no_ctx)
|
data = await _do_send(body_no_ctx)
|
||||||
ret = data.get("ret", 0)
|
ret = data.get("ret", 0)
|
||||||
errcode = data.get("errcode", 0)
|
errcode = data.get("errcode", 0)
|
||||||
errmsg = data.get("errmsg", "")
|
errmsg = data.get("errmsg", "")
|
||||||
@ -1216,6 +1207,23 @@ class WeixinChannel(BaseChannel):
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Rate limit (-2) — per wxclawbot-cli docs this is ~7 msgs / 5 min.
|
||||||
|
# Wait 60 s and retry once; do NOT strip context_token (rate limit is
|
||||||
|
# per-bot, not per-token).
|
||||||
|
if (ret == RATE_LIMIT_ERRCODE or errcode == RATE_LIMIT_ERRCODE):
|
||||||
|
self.logger.warning(
|
||||||
|
"WeChat send text rate limited for {} (client_id={}); "
|
||||||
|
"waiting {}s before retry",
|
||||||
|
to_user_id,
|
||||||
|
client_id,
|
||||||
|
RATE_LIMIT_BACKOFF_S,
|
||||||
|
)
|
||||||
|
await asyncio.sleep(RATE_LIMIT_BACKOFF_S)
|
||||||
|
data = await _do_send(body)
|
||||||
|
ret = data.get("ret", 0)
|
||||||
|
errcode = data.get("errcode", 0)
|
||||||
|
errmsg = data.get("errmsg", "")
|
||||||
|
|
||||||
self._check_response_error(data, "send text", body=body)
|
self._check_response_error(data, "send text", body=body)
|
||||||
self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id)
|
self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id)
|
||||||
|
|
||||||
@ -1362,12 +1370,15 @@ class WeixinChannel(BaseChannel):
|
|||||||
"base_info": BASE_INFO,
|
"base_info": BASE_INFO,
|
||||||
}
|
}
|
||||||
|
|
||||||
data = await self._api_post("ilink/bot/sendmessage", body)
|
async def _do_send(_body: dict[str, Any]) -> dict:
|
||||||
|
return await self._api_post("ilink/bot/sendmessage", _body)
|
||||||
|
|
||||||
|
data = await _do_send(body)
|
||||||
ret = data.get("ret", 0)
|
ret = data.get("ret", 0)
|
||||||
errcode = data.get("errcode", 0)
|
errcode = data.get("errcode", 0)
|
||||||
errmsg = data.get("errmsg", "")
|
errmsg = data.get("errmsg", "")
|
||||||
|
|
||||||
# Same stale-session handling as _send_text (hermes-agent#17228 / #18100).
|
# Same stale-session handling as _send_text.
|
||||||
if _is_stale_session_ret(ret, errcode, errmsg) and context_token:
|
if _is_stale_session_ret(ret, errcode, errmsg) and context_token:
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
"WeChat send media returned stale-session signal for {} (client_id={}); "
|
"WeChat send media returned stale-session signal for {} (client_id={}); "
|
||||||
@ -1377,7 +1388,7 @@ class WeixinChannel(BaseChannel):
|
|||||||
)
|
)
|
||||||
body_no_ctx = copy.deepcopy(body)
|
body_no_ctx = copy.deepcopy(body)
|
||||||
body_no_ctx["msg"].pop("context_token", None)
|
body_no_ctx["msg"].pop("context_token", None)
|
||||||
data = await self._api_post("ilink/bot/sendmessage", body_no_ctx)
|
data = await _do_send(body_no_ctx)
|
||||||
ret = data.get("ret", 0)
|
ret = data.get("ret", 0)
|
||||||
errcode = data.get("errcode", 0)
|
errcode = data.get("errcode", 0)
|
||||||
errmsg = data.get("errmsg", "")
|
errmsg = data.get("errmsg", "")
|
||||||
@ -1391,6 +1402,21 @@ class WeixinChannel(BaseChannel):
|
|||||||
self._save_state()
|
self._save_state()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Rate limit (-2) — wait and retry once (see _send_text for rationale).
|
||||||
|
if (ret == RATE_LIMIT_ERRCODE or errcode == RATE_LIMIT_ERRCODE):
|
||||||
|
self.logger.warning(
|
||||||
|
"WeChat send media rate limited for {} (client_id={}); "
|
||||||
|
"waiting {}s before retry",
|
||||||
|
to_user_id,
|
||||||
|
client_id,
|
||||||
|
RATE_LIMIT_BACKOFF_S,
|
||||||
|
)
|
||||||
|
await asyncio.sleep(RATE_LIMIT_BACKOFF_S)
|
||||||
|
data = await _do_send(body)
|
||||||
|
ret = data.get("ret", 0)
|
||||||
|
errcode = data.get("errcode", 0)
|
||||||
|
errmsg = data.get("errmsg", "")
|
||||||
|
|
||||||
self._check_response_error(data, "send media", body=body)
|
self._check_response_error(data, "send media", body=body)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1374,8 +1374,8 @@ async def test_send_text_retries_without_context_token_on_ret_minus_two() -> Non
|
|||||||
|
|
||||||
channel._api_post = AsyncMock(
|
channel._api_post = AsyncMock(
|
||||||
side_effect=[
|
side_effect=[
|
||||||
{"ret": -2}, # first attempt with token fails
|
{"ret": -2, "errmsg": "unknown error"}, # stale session with token
|
||||||
{"ret": 0}, # retry without token succeeds
|
{"ret": 0}, # retry without token succeeds
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1394,8 +1394,8 @@ async def test_send_text_retries_without_context_token_on_ret_minus_two() -> Non
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_send_text_raises_when_retry_also_fails_with_stale_session() -> None:
|
async def test_send_text_stale_session_retries_without_token_then_rate_limit_backoff(monkeypatch) -> None:
|
||||||
"""If both attempts return stale-session ret=-2, raise so ChannelManager retries."""
|
"""Stale-session 'unknown error' triggers tokenless retry, then rate-limit backoff."""
|
||||||
channel, _bus = _make_channel()
|
channel, _bus = _make_channel()
|
||||||
channel._client = object()
|
channel._client = object()
|
||||||
channel._token = "token"
|
channel._token = "token"
|
||||||
@ -1403,33 +1403,44 @@ async def test_send_text_raises_when_retry_also_fails_with_stale_session() -> No
|
|||||||
|
|
||||||
channel._api_post = AsyncMock(
|
channel._api_post = AsyncMock(
|
||||||
side_effect=[
|
side_effect=[
|
||||||
{"ret": -2}, # with token
|
{"ret": -2, "errmsg": "unknown error"}, # with token
|
||||||
{"ret": -2}, # without token
|
{"ret": -2, "errmsg": "unknown error"}, # without token
|
||||||
|
{"ret": -2, "errmsg": "unknown error"}, # rate-limit retry
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Speed up the 60-second backoff for testing
|
||||||
|
monkeypatch.setattr(weixin_mod, "RATE_LIMIT_BACKOFF_S", 0)
|
||||||
|
|
||||||
with pytest.raises(RuntimeError, match="WeChat send text error"):
|
with pytest.raises(RuntimeError, match="WeChat send text error"):
|
||||||
await channel._send_text("wx-user", "hello", "bad-token")
|
await channel._send_text("wx-user", "hello", "bad-token")
|
||||||
|
|
||||||
assert channel._api_post.await_count == 2
|
# 3 calls: original + tokenless + rate-limit retry
|
||||||
# Token is NOT cleared because retry also failed
|
assert channel._api_post.await_count == 3
|
||||||
|
# Token is NOT cleared because tokenless retry also failed
|
||||||
assert channel._context_tokens.get("wx-user") == "bad-token"
|
assert channel._context_tokens.get("wx-user") == "bad-token"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_send_text_raises_on_ret_minus_two_when_no_context_token() -> None:
|
async def test_send_text_rate_limit_with_empty_errmsg_waits_and_retries(monkeypatch) -> None:
|
||||||
"""If no context_token was provided, ret=-2 stale session is raised."""
|
"""Empty errmsg ret=-2 is treated as rate limit: wait then retry once."""
|
||||||
channel, _bus = _make_channel()
|
channel, _bus = _make_channel()
|
||||||
channel._client = object()
|
channel._client = object()
|
||||||
channel._token = "token"
|
channel._token = "token"
|
||||||
|
|
||||||
channel._api_post = AsyncMock(return_value={"ret": -2})
|
channel._api_post = AsyncMock(
|
||||||
|
side_effect=[
|
||||||
|
{"ret": -2}, # first attempt — rate limit
|
||||||
|
{"ret": -2}, # backoff retry — still rate limit
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
monkeypatch.setattr(weixin_mod, "RATE_LIMIT_BACKOFF_S", 0)
|
||||||
|
|
||||||
with pytest.raises(RuntimeError, match="WeChat send text error"):
|
with pytest.raises(RuntimeError, match="WeChat send text error"):
|
||||||
await channel._send_text("wx-user", "hello", "")
|
await channel._send_text("wx-user", "hello", "")
|
||||||
|
|
||||||
# Only one API call (no retry possible without token)
|
assert channel._api_post.await_count == 2
|
||||||
channel._api_post.assert_awaited_once()
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@ -1440,13 +1451,14 @@ async def test_send_text_raises_on_ret_minus_two_when_no_context_token() -> None
|
|||||||
class TestIsStaleSessionRet:
|
class TestIsStaleSessionRet:
|
||||||
"""Verify stale-session detection for iLink ret=-2 / errcode=-2 responses."""
|
"""Verify stale-session detection for iLink ret=-2 / errcode=-2 responses."""
|
||||||
|
|
||||||
def test_ret_minus_2_with_empty_errmsg_is_stale(self):
|
def test_ret_minus_2_with_empty_errmsg_is_not_stale(self):
|
||||||
assert weixin_mod._is_stale_session_ret(-2, 0, "") is True
|
# Empty errmsg is the rate-limit signal per wxclawbot-cli docs.
|
||||||
assert weixin_mod._is_stale_session_ret(-2, 0, None) is True
|
assert weixin_mod._is_stale_session_ret(-2, 0, "") is False
|
||||||
|
assert weixin_mod._is_stale_session_ret(-2, 0, None) is False
|
||||||
|
|
||||||
def test_errcode_minus_2_with_empty_errmsg_is_stale(self):
|
def test_errcode_minus_2_with_empty_errmsg_is_not_stale(self):
|
||||||
assert weixin_mod._is_stale_session_ret(0, -2, "") is True
|
assert weixin_mod._is_stale_session_ret(0, -2, "") is False
|
||||||
assert weixin_mod._is_stale_session_ret(0, -2, None) is True
|
assert weixin_mod._is_stale_session_ret(0, -2, None) is False
|
||||||
|
|
||||||
def test_ret_minus_2_with_unknown_error_is_stale(self):
|
def test_ret_minus_2_with_unknown_error_is_stale(self):
|
||||||
assert weixin_mod._is_stale_session_ret(-2, 0, "unknown error") is True
|
assert weixin_mod._is_stale_session_ret(-2, 0, "unknown error") is True
|
||||||
@ -1469,3 +1481,24 @@ class TestIsStaleSessionRet:
|
|||||||
def test_other_errors_are_not_stale(self):
|
def test_other_errors_are_not_stale(self):
|
||||||
assert weixin_mod._is_stale_session_ret(-14, -14, "session timeout") is False
|
assert weixin_mod._is_stale_session_ret(-14, -14, "session timeout") is False
|
||||||
assert weixin_mod._is_stale_session_ret(-100, 0, "internal error") is False
|
assert weixin_mod._is_stale_session_ret(-100, 0, "internal error") is False
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_send_text_rate_limit_backoff_succeeds(monkeypatch) -> None:
|
||||||
|
"""If rate-limit retry succeeds after backoff, return normally."""
|
||||||
|
channel, _bus = _make_channel()
|
||||||
|
channel._client = object()
|
||||||
|
channel._token = "token"
|
||||||
|
|
||||||
|
channel._api_post = AsyncMock(
|
||||||
|
side_effect=[
|
||||||
|
{"ret": -2}, # first attempt — rate limit
|
||||||
|
{"ret": 0}, # backoff retry — success
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
monkeypatch.setattr(weixin_mod, "RATE_LIMIT_BACKOFF_S", 0)
|
||||||
|
|
||||||
|
await channel._send_text("wx-user", "hello", "")
|
||||||
|
|
||||||
|
assert channel._api_post.await_count == 2
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user