From 2e56fb95b6cb9a079d26aa5c5bf42f22b87acbbc Mon Sep 17 00:00:00 2001 From: chengyongru Date: Fri, 8 May 2026 10:09:12 +0800 Subject: [PATCH] fix(weixin): treat ret=-2 as rate limit with 60s backoff Reference wxclawbot-cli docs: ret=-2 is a rate limit (~7 msgs / 5 min per bot), NOT a stale session signal. Empty/missing errmsg is the normal rate-limit response; only 'unknown error' correlates with stale session per hermes-agent. Changes: - _is_stale_session_ret: only match 'unknown error', not empty errmsg - _send_text/_send_media_file: on ret=-2 wait 60s then retry once instead of retrying without context_token - Remove stale-session retry for empty errmsg (was burning quota) - Update tests to cover rate-limit backoff path --- nanobot/channels/weixin.py | 84 ++++++++++++++++++--------- tests/channels/test_weixin_channel.py | 71 ++++++++++++++++------ 2 files changed, 107 insertions(+), 48 deletions(-) diff --git a/nanobot/channels/weixin.py b/nanobot/channels/weixin.py index 4fe8c3667..4c2586b2b 100644 --- a/nanobot/channels/weixin.py +++ b/nanobot/channels/weixin.py @@ -83,31 +83,26 @@ SESSION_PAUSE_DURATION_S = 60 * 60 # iLink rate-limit / stale-session errcode RATE_LIMIT_ERRCODE = -2 +# iLink rate-limit backoff (wxclawbot-cli docs: ~7 msgs / 5 min per bot) +RATE_LIMIT_BACKOFF_S = 60 + def _is_stale_session_ret( ret: int | None, errcode: int | None, errmsg: str | None, ) -> bool: - """True when iLink returns ret=-2 / errcode=-2 that is likely a stale - context_token rather than a genuine rate limit. + """True when iLink returns ret=-2 / errcode=-2 with 'unknown error', + which historically correlates with a stale context_token. - Empirically iLink signals these two scenarios weakly: - - stale session: ret=-2, errmsg="unknown error" OR errmsg empty/None - - genuine rate limit: ret=-2 with a populated errmsg such as - "frequency limit" / "too frequently" / similar - - Treating "unknown error" and empty/None errmsg as stale-session signals - lets the caller attempt one tokenless retry. A true rate limit still - falls through to the existing retry/backoff path if the tokenless - attempt also fails. + Note: per wxclawbot-cli docs, ret=-2 is primarily a *rate limit* + (~7 msgs / 5 min per bot). We only treat the 'unknown error' variant + as stale-session because empty/missing errmsg is far more commonly + the rate-limit signal in practice. """ if ret != RATE_LIMIT_ERRCODE and errcode != RATE_LIMIT_ERRCODE: return False - msg = (errmsg or "").strip().lower() - if not msg: - return True - return msg == "unknown error" + return (errmsg or "").strip().lower() == "unknown error" # Retry constants (matching the reference plugin's monitor.ts) @@ -1176,20 +1171,16 @@ class WeixinChannel(BaseChannel): "base_info": BASE_INFO, } - data = await self._api_post("ilink/bot/sendmessage", body) + async def _do_send(_body: dict[str, Any]) -> dict: + return await self._api_post("ilink/bot/sendmessage", _body) + + data = await _do_send(body) ret = data.get("ret", 0) errcode = data.get("errcode", 0) errmsg = data.get("errmsg", "") - # The iLink sendmessage API may return ret=-2 / errcode=-2 for two - # different reasons: - # - stale context_token: errmsg is empty/None or "unknown error" - # - genuine rate limit: errmsg is populated (e.g. "frequency limit") - # Per hermes-agent#17228 / #18100, the empty/None variant is a stale - # session signal. Retry once without context_token (iLink accepts - # tokenless sends as a degraded fallback). If the tokenless attempt - # also fails, let _check_response_error raise so ChannelManager can - # retry with backoff — do NOT swallow the error. + # Stale session (errmsg == "unknown error") — retry once without token. + # This is distinct from the far more common rate-limit signal. if _is_stale_session_ret(ret, errcode, errmsg) and context_token: self.logger.warning( "WeChat send text returned stale-session signal for {} (client_id={}); " @@ -1199,7 +1190,7 @@ class WeixinChannel(BaseChannel): ) body_no_ctx = copy.deepcopy(body) body_no_ctx["msg"].pop("context_token", None) - data = await self._api_post("ilink/bot/sendmessage", body_no_ctx) + data = await _do_send(body_no_ctx) ret = data.get("ret", 0) errcode = data.get("errcode", 0) errmsg = data.get("errmsg", "") @@ -1216,6 +1207,23 @@ class WeixinChannel(BaseChannel): ) return + # Rate limit (-2) — per wxclawbot-cli docs this is ~7 msgs / 5 min. + # Wait 60 s and retry once; do NOT strip context_token (rate limit is + # per-bot, not per-token). + if (ret == RATE_LIMIT_ERRCODE or errcode == RATE_LIMIT_ERRCODE): + self.logger.warning( + "WeChat send text rate limited for {} (client_id={}); " + "waiting {}s before retry", + to_user_id, + client_id, + RATE_LIMIT_BACKOFF_S, + ) + await asyncio.sleep(RATE_LIMIT_BACKOFF_S) + data = await _do_send(body) + ret = data.get("ret", 0) + errcode = data.get("errcode", 0) + errmsg = data.get("errmsg", "") + self._check_response_error(data, "send text", body=body) self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id) @@ -1362,12 +1370,15 @@ class WeixinChannel(BaseChannel): "base_info": BASE_INFO, } - data = await self._api_post("ilink/bot/sendmessage", body) + async def _do_send(_body: dict[str, Any]) -> dict: + return await self._api_post("ilink/bot/sendmessage", _body) + + data = await _do_send(body) ret = data.get("ret", 0) errcode = data.get("errcode", 0) errmsg = data.get("errmsg", "") - # Same stale-session handling as _send_text (hermes-agent#17228 / #18100). + # Same stale-session handling as _send_text. if _is_stale_session_ret(ret, errcode, errmsg) and context_token: self.logger.warning( "WeChat send media returned stale-session signal for {} (client_id={}); " @@ -1377,7 +1388,7 @@ class WeixinChannel(BaseChannel): ) body_no_ctx = copy.deepcopy(body) body_no_ctx["msg"].pop("context_token", None) - data = await self._api_post("ilink/bot/sendmessage", body_no_ctx) + data = await _do_send(body_no_ctx) ret = data.get("ret", 0) errcode = data.get("errcode", 0) errmsg = data.get("errmsg", "") @@ -1391,6 +1402,21 @@ class WeixinChannel(BaseChannel): self._save_state() return + # Rate limit (-2) — wait and retry once (see _send_text for rationale). + if (ret == RATE_LIMIT_ERRCODE or errcode == RATE_LIMIT_ERRCODE): + self.logger.warning( + "WeChat send media rate limited for {} (client_id={}); " + "waiting {}s before retry", + to_user_id, + client_id, + RATE_LIMIT_BACKOFF_S, + ) + await asyncio.sleep(RATE_LIMIT_BACKOFF_S) + data = await _do_send(body) + ret = data.get("ret", 0) + errcode = data.get("errcode", 0) + errmsg = data.get("errmsg", "") + self._check_response_error(data, "send media", body=body) diff --git a/tests/channels/test_weixin_channel.py b/tests/channels/test_weixin_channel.py index 33037d8ff..68b8825d7 100644 --- a/tests/channels/test_weixin_channel.py +++ b/tests/channels/test_weixin_channel.py @@ -1374,8 +1374,8 @@ async def test_send_text_retries_without_context_token_on_ret_minus_two() -> Non channel._api_post = AsyncMock( side_effect=[ - {"ret": -2}, # first attempt with token fails - {"ret": 0}, # retry without token succeeds + {"ret": -2, "errmsg": "unknown error"}, # stale session with token + {"ret": 0}, # retry without token succeeds ] ) @@ -1394,8 +1394,8 @@ async def test_send_text_retries_without_context_token_on_ret_minus_two() -> Non @pytest.mark.asyncio -async def test_send_text_raises_when_retry_also_fails_with_stale_session() -> None: - """If both attempts return stale-session ret=-2, raise so ChannelManager retries.""" +async def test_send_text_stale_session_retries_without_token_then_rate_limit_backoff(monkeypatch) -> None: + """Stale-session 'unknown error' triggers tokenless retry, then rate-limit backoff.""" channel, _bus = _make_channel() channel._client = object() channel._token = "token" @@ -1403,33 +1403,44 @@ async def test_send_text_raises_when_retry_also_fails_with_stale_session() -> No channel._api_post = AsyncMock( side_effect=[ - {"ret": -2}, # with token - {"ret": -2}, # without token + {"ret": -2, "errmsg": "unknown error"}, # with token + {"ret": -2, "errmsg": "unknown error"}, # without token + {"ret": -2, "errmsg": "unknown error"}, # rate-limit retry ] ) + # Speed up the 60-second backoff for testing + monkeypatch.setattr(weixin_mod, "RATE_LIMIT_BACKOFF_S", 0) + with pytest.raises(RuntimeError, match="WeChat send text error"): await channel._send_text("wx-user", "hello", "bad-token") - assert channel._api_post.await_count == 2 - # Token is NOT cleared because retry also failed + # 3 calls: original + tokenless + rate-limit retry + assert channel._api_post.await_count == 3 + # Token is NOT cleared because tokenless retry also failed assert channel._context_tokens.get("wx-user") == "bad-token" @pytest.mark.asyncio -async def test_send_text_raises_on_ret_minus_two_when_no_context_token() -> None: - """If no context_token was provided, ret=-2 stale session is raised.""" +async def test_send_text_rate_limit_with_empty_errmsg_waits_and_retries(monkeypatch) -> None: + """Empty errmsg ret=-2 is treated as rate limit: wait then retry once.""" channel, _bus = _make_channel() channel._client = object() channel._token = "token" - channel._api_post = AsyncMock(return_value={"ret": -2}) + channel._api_post = AsyncMock( + side_effect=[ + {"ret": -2}, # first attempt — rate limit + {"ret": -2}, # backoff retry — still rate limit + ] + ) + + monkeypatch.setattr(weixin_mod, "RATE_LIMIT_BACKOFF_S", 0) with pytest.raises(RuntimeError, match="WeChat send text error"): await channel._send_text("wx-user", "hello", "") - # Only one API call (no retry possible without token) - channel._api_post.assert_awaited_once() + assert channel._api_post.await_count == 2 # --------------------------------------------------------------------------- @@ -1440,13 +1451,14 @@ async def test_send_text_raises_on_ret_minus_two_when_no_context_token() -> None class TestIsStaleSessionRet: """Verify stale-session detection for iLink ret=-2 / errcode=-2 responses.""" - def test_ret_minus_2_with_empty_errmsg_is_stale(self): - assert weixin_mod._is_stale_session_ret(-2, 0, "") is True - assert weixin_mod._is_stale_session_ret(-2, 0, None) is True + def test_ret_minus_2_with_empty_errmsg_is_not_stale(self): + # Empty errmsg is the rate-limit signal per wxclawbot-cli docs. + assert weixin_mod._is_stale_session_ret(-2, 0, "") is False + assert weixin_mod._is_stale_session_ret(-2, 0, None) is False - def test_errcode_minus_2_with_empty_errmsg_is_stale(self): - assert weixin_mod._is_stale_session_ret(0, -2, "") is True - assert weixin_mod._is_stale_session_ret(0, -2, None) is True + def test_errcode_minus_2_with_empty_errmsg_is_not_stale(self): + assert weixin_mod._is_stale_session_ret(0, -2, "") is False + assert weixin_mod._is_stale_session_ret(0, -2, None) is False def test_ret_minus_2_with_unknown_error_is_stale(self): assert weixin_mod._is_stale_session_ret(-2, 0, "unknown error") is True @@ -1469,3 +1481,24 @@ class TestIsStaleSessionRet: def test_other_errors_are_not_stale(self): assert weixin_mod._is_stale_session_ret(-14, -14, "session timeout") is False assert weixin_mod._is_stale_session_ret(-100, 0, "internal error") is False + + +@pytest.mark.asyncio +async def test_send_text_rate_limit_backoff_succeeds(monkeypatch) -> None: + """If rate-limit retry succeeds after backoff, return normally.""" + channel, _bus = _make_channel() + channel._client = object() + channel._token = "token" + + channel._api_post = AsyncMock( + side_effect=[ + {"ret": -2}, # first attempt — rate limit + {"ret": 0}, # backoff retry — success + ] + ) + + monkeypatch.setattr(weixin_mod, "RATE_LIMIT_BACKOFF_S", 0) + + await channel._send_text("wx-user", "hello", "") + + assert channel._api_post.await_count == 2