diff --git a/nanobot/channels/weixin.py b/nanobot/channels/weixin.py index d15d09267..4fe8c3667 100644 --- a/nanobot/channels/weixin.py +++ b/nanobot/channels/weixin.py @@ -80,6 +80,36 @@ BASE_INFO: dict[str, str] = {"channel_version": WEIXIN_CHANNEL_VERSION} ERRCODE_SESSION_EXPIRED = -14 SESSION_PAUSE_DURATION_S = 60 * 60 +# iLink rate-limit / stale-session errcode +RATE_LIMIT_ERRCODE = -2 + + +def _is_stale_session_ret( + ret: int | None, + errcode: int | None, + errmsg: str | None, +) -> bool: + """True when iLink returns ret=-2 / errcode=-2 that is likely a stale + context_token rather than a genuine rate limit. + + Empirically iLink signals these two scenarios weakly: + - stale session: ret=-2, errmsg="unknown error" OR errmsg empty/None + - genuine rate limit: ret=-2 with a populated errmsg such as + "frequency limit" / "too frequently" / similar + + Treating "unknown error" and empty/None errmsg as stale-session signals + lets the caller attempt one tokenless retry. A true rate limit still + falls through to the existing retry/backoff path if the tokenless + attempt also fails. + """ + if ret != RATE_LIMIT_ERRCODE and errcode != RATE_LIMIT_ERRCODE: + return False + msg = (errmsg or "").strip().lower() + if not msg: + return True + return msg == "unknown error" + + # Retry constants (matching the reference plugin's monitor.ts) MAX_CONSECUTIVE_FAILURES = 3 BACKOFF_DELAY_S = 30 @@ -1149,49 +1179,42 @@ class WeixinChannel(BaseChannel): data = await self._api_post("ilink/bot/sendmessage", body) ret = data.get("ret", 0) errcode = data.get("errcode", 0) + errmsg = data.get("errmsg", "") - # The iLink sendmessage API frequently returns ret=-2 (parameter - # error / rate limit / expired token) even though HTTP status is 200. - # The openclaw reference plugin ignores the JSON body for sendmessage - # and only checks HTTP status. We retry once without context_token - # (a common fix for token expiry per openclaw#61174), but if that - # also fails we swallow ret=-2 to avoid silent message drops. - if ret == -2: - if context_token: - self.logger.warning( - "WeChat send text returned ret=-2 for {} (client_id={}); " - "retrying without context_token", - to_user_id, - client_id, - ) - body_no_ctx = copy.deepcopy(body) - body_no_ctx["msg"].pop("context_token", None) - data = await self._api_post("ilink/bot/sendmessage", body_no_ctx) - ret = data.get("ret", 0) - errcode = data.get("errcode", 0) - if ret == 0 and (errcode == 0 or errcode is None): - self.logger.warning( - "WeChat send text succeeded WITHOUT context_token for {}; " - "clearing expired token from cache", - to_user_id, - ) - self._context_tokens.pop(to_user_id, None) - self._save_state() - self.logger.debug( - "WeChat text sent to {} (client_id={})", to_user_id, client_id - ) - return - # Treat persistent ret=-2 as non-fatal (matching reference plugin). + # The iLink sendmessage API may return ret=-2 / errcode=-2 for two + # different reasons: + # - stale context_token: errmsg is empty/None or "unknown error" + # - genuine rate limit: errmsg is populated (e.g. "frequency limit") + # Per hermes-agent#17228 / #18100, the empty/None variant is a stale + # session signal. Retry once without context_token (iLink accepts + # tokenless sends as a degraded fallback). If the tokenless attempt + # also fails, let _check_response_error raise so ChannelManager can + # retry with backoff — do NOT swallow the error. + if _is_stale_session_ret(ret, errcode, errmsg) and context_token: self.logger.warning( - "WeChat send text returned ret=-2 for {} (client_id={}); " - "treating as non-fatal. Request body: {}. Response: {}", + "WeChat send text returned stale-session signal for {} (client_id={}); " + "retrying without context_token", to_user_id, client_id, - json.dumps(body, ensure_ascii=False, default=str), - json.dumps(data, ensure_ascii=False, default=str), ) - self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id) - return + body_no_ctx = copy.deepcopy(body) + body_no_ctx["msg"].pop("context_token", None) + data = await self._api_post("ilink/bot/sendmessage", body_no_ctx) + ret = data.get("ret", 0) + errcode = data.get("errcode", 0) + errmsg = data.get("errmsg", "") + if ret == 0 and (errcode == 0 or errcode is None): + self.logger.warning( + "WeChat send text succeeded WITHOUT context_token for {}; " + "clearing expired token from cache", + to_user_id, + ) + self._context_tokens.pop(to_user_id, None) + self._save_state() + self.logger.debug( + "WeChat text sent to {} (client_id={})", to_user_id, client_id + ) + return self._check_response_error(data, "send text", body=body) self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id) @@ -1341,14 +1364,33 @@ class WeixinChannel(BaseChannel): data = await self._api_post("ilink/bot/sendmessage", body) ret = data.get("ret", 0) - if ret == -2: - # See _send_text for rationale: openclaw ignores ret on sendmessage. + errcode = data.get("errcode", 0) + errmsg = data.get("errmsg", "") + + # Same stale-session handling as _send_text (hermes-agent#17228 / #18100). + if _is_stale_session_ret(ret, errcode, errmsg) and context_token: self.logger.warning( - "WeChat send media returned ret=-2 for {} (client_id={}); treating as non-fatal", + "WeChat send media returned stale-session signal for {} (client_id={}); " + "retrying without context_token", to_user_id, client_id, ) - return + body_no_ctx = copy.deepcopy(body) + body_no_ctx["msg"].pop("context_token", None) + data = await self._api_post("ilink/bot/sendmessage", body_no_ctx) + ret = data.get("ret", 0) + errcode = data.get("errcode", 0) + errmsg = data.get("errmsg", "") + if ret == 0 and (errcode == 0 or errcode is None): + self.logger.warning( + "WeChat send media succeeded WITHOUT context_token for {}; " + "clearing expired token from cache", + to_user_id, + ) + self._context_tokens.pop(to_user_id, None) + self._save_state() + return + self._check_response_error(data, "send media", body=body) diff --git a/tests/channels/test_weixin_channel.py b/tests/channels/test_weixin_channel.py index d903d7d18..33037d8ff 100644 --- a/tests/channels/test_weixin_channel.py +++ b/tests/channels/test_weixin_channel.py @@ -1394,8 +1394,8 @@ async def test_send_text_retries_without_context_token_on_ret_minus_two() -> Non @pytest.mark.asyncio -async def test_send_text_swallows_ret_minus_two_when_retry_also_fails() -> None: - """If both attempts return ret=-2, swallow the error (matching openclaw).""" +async def test_send_text_raises_when_retry_also_fails_with_stale_session() -> None: + """If both attempts return stale-session ret=-2, raise so ChannelManager retries.""" channel, _bus = _make_channel() channel._client = object() channel._token = "token" @@ -1408,8 +1408,8 @@ async def test_send_text_swallows_ret_minus_two_when_retry_also_fails() -> None: ] ) - # Should NOT raise - await channel._send_text("wx-user", "hello", "bad-token") + with pytest.raises(RuntimeError, match="WeChat send text error"): + await channel._send_text("wx-user", "hello", "bad-token") assert channel._api_post.await_count == 2 # Token is NOT cleared because retry also failed @@ -1417,16 +1417,55 @@ async def test_send_text_swallows_ret_minus_two_when_retry_also_fails() -> None: @pytest.mark.asyncio -async def test_send_text_swallows_ret_minus_two_when_no_context_token() -> None: - """If no context_token was provided, ret=-2 is swallowed without retry.""" +async def test_send_text_raises_on_ret_minus_two_when_no_context_token() -> None: + """If no context_token was provided, ret=-2 stale session is raised.""" channel, _bus = _make_channel() channel._client = object() channel._token = "token" channel._api_post = AsyncMock(return_value={"ret": -2}) - # Should NOT raise - await channel._send_text("wx-user", "hello", "") + with pytest.raises(RuntimeError, match="WeChat send text error"): + await channel._send_text("wx-user", "hello", "") - # Only one API call (no retry) + # Only one API call (no retry possible without token) channel._api_post.assert_awaited_once() + + +# --------------------------------------------------------------------------- +# Tests for _is_stale_session_ret (hermes-agent#17228 / #18105) +# --------------------------------------------------------------------------- + + +class TestIsStaleSessionRet: + """Verify stale-session detection for iLink ret=-2 / errcode=-2 responses.""" + + def test_ret_minus_2_with_empty_errmsg_is_stale(self): + assert weixin_mod._is_stale_session_ret(-2, 0, "") is True + assert weixin_mod._is_stale_session_ret(-2, 0, None) is True + + def test_errcode_minus_2_with_empty_errmsg_is_stale(self): + assert weixin_mod._is_stale_session_ret(0, -2, "") is True + assert weixin_mod._is_stale_session_ret(0, -2, None) is True + + def test_ret_minus_2_with_unknown_error_is_stale(self): + assert weixin_mod._is_stale_session_ret(-2, 0, "unknown error") is True + assert weixin_mod._is_stale_session_ret(-2, 0, "UNKNOWN ERROR") is True + + def test_errcode_minus_2_with_unknown_error_is_stale(self): + assert weixin_mod._is_stale_session_ret(0, -2, "unknown error") is True + + def test_ret_minus_2_with_frequency_limit_is_not_stale(self): + assert weixin_mod._is_stale_session_ret(-2, 0, "frequency limit") is False + assert weixin_mod._is_stale_session_ret(-2, 0, "too frequently") is False + + def test_errcode_minus_2_with_frequency_limit_is_not_stale(self): + assert weixin_mod._is_stale_session_ret(0, -2, "freq limit") is False + + def test_success_codes_are_not_stale(self): + assert weixin_mod._is_stale_session_ret(0, 0, "") is False + assert weixin_mod._is_stale_session_ret(0, 0, None) is False + + def test_other_errors_are_not_stale(self): + assert weixin_mod._is_stale_session_ret(-14, -14, "session timeout") is False + assert weixin_mod._is_stale_session_ret(-100, 0, "internal error") is False