From 28358980ed1dce04c25a1875ad451d5dc7d0e536 Mon Sep 17 00:00:00 2001 From: chengyongru Date: Thu, 7 May 2026 17:38:11 +0800 Subject: [PATCH] fix(weixin): retry send without expired context_token on ret=-2 When the iLink API returns ret=-2 (parameter error), it is often caused by an expired context_token rather than a malformed payload. After a gateway restart, the cached token can become stale within ~90 seconds if no new inbound message refreshes it, causing all outbound replies to fail silently. Changes: - _send_text: retry once without context_token when ret=-2 and a token was present; if the retry succeeds, clear the expired token from cache. - Remove leftover @staticmethod on _check_response_error so self.logger and the body parameter work correctly. - Bump WEIXIN_CHANNEL_VERSION from 2.1.1 -> 2.1.7 to match the reference openclaw-weixin plugin. - Add tests covering the ret=-2 retry path, failure path, and no-token path. References: - openclaw/openclaw#61174 (context_token expiry after long agent turns) - hermes-agent#21011 (ret=-2 rate limiting / parameter error) --- nanobot/channels/weixin.py | 49 ++++++++++++++---- tests/channels/test_weixin_channel.py | 72 ++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 11 deletions(-) diff --git a/nanobot/channels/weixin.py b/nanobot/channels/weixin.py index 9c8144a9c..77f22b30c 100644 --- a/nanobot/channels/weixin.py +++ b/nanobot/channels/weixin.py @@ -11,6 +11,7 @@ from __future__ import annotations import asyncio import base64 +import copy import hashlib import json import os @@ -54,7 +55,7 @@ MESSAGE_TYPE_BOT = 2 MESSAGE_STATE_FINISH = 2 WEIXIN_MAX_MESSAGE_LEN = 4000 -WEIXIN_CHANNEL_VERSION = "2.1.1" +WEIXIN_CHANNEL_VERSION = "2.1.7" ILINK_APP_ID = "bot" @@ -526,8 +527,7 @@ class WeixinChannel(BaseChannel): f"WeChat session paused, {remaining_min} min remaining (errcode {ERRCODE_SESSION_EXPIRED})" ) - @staticmethod - def _check_response_error(data: dict, operation: str) -> None: + def _check_response_error(self, data: dict, operation: str, *, body: dict | None = None) -> None: """Check both ``ret`` and ``errcode`` like the reference TS code. The iLink API may signal failure through either field (or both). @@ -537,10 +537,11 @@ class WeixinChannel(BaseChannel): ret = data.get("ret", 0) errcode = data.get("errcode", 0) is_error = (ret is not None and ret != 0) or (errcode is not None and errcode != 0) - if is_error: - raise RuntimeError( - f"WeChat {operation} error (ret={ret}, errcode={errcode}): {data.get('errmsg', '')}" - ) + if not is_error: + return + raise RuntimeError( + f"WeChat {operation} error (ret={ret}, errcode={errcode}): {data.get('errmsg', '')}" + ) async def _poll_once(self) -> None: remaining = self._session_pause_remaining_s() @@ -1139,7 +1140,37 @@ class WeixinChannel(BaseChannel): } data = await self._api_post("ilink/bot/sendmessage", body) - self._check_response_error(data, "send text") + ret = data.get("ret", 0) + errcode = data.get("errcode", 0) + + # If ret=-2 (parameter error / rate limit / expired token) and we sent + # with a context_token, retry once without it. The openclaw reference + # plugin can send without a token (it just warns), and issue #61174 + # shows that expired context_tokens are a common cause of ret=-2. + if ret == -2 and context_token: + self.logger.warning( + "WeChat send text returned ret=-2 for {} (client_id={}); " + "retrying without context_token", + to_user_id, + client_id, + ) + body_no_ctx = copy.deepcopy(body) + body_no_ctx["msg"].pop("context_token", None) + data = await self._api_post("ilink/bot/sendmessage", body_no_ctx) + ret = data.get("ret", 0) + errcode = data.get("errcode", 0) + if ret == 0 and (errcode == 0 or errcode is None): + self.logger.warning( + "WeChat send text succeeded WITHOUT context_token for {}; " + "clearing expired token from cache", + to_user_id, + ) + self._context_tokens.pop(to_user_id, None) + self._save_state() + self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id) + return + + self._check_response_error(data, "send text", body=body) self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id) async def _send_media_file( @@ -1286,7 +1317,7 @@ class WeixinChannel(BaseChannel): } data = await self._api_post("ilink/bot/sendmessage", body) - self._check_response_error(data, "send media") + self._check_response_error(data, "send media", body=body) # --------------------------------------------------------------------------- diff --git a/tests/channels/test_weixin_channel.py b/tests/channels/test_weixin_channel.py index 0722cfc7b..e5218272c 100644 --- a/tests/channels/test_weixin_channel.py +++ b/tests/channels/test_weixin_channel.py @@ -48,11 +48,11 @@ def test_make_headers_includes_route_tag_when_configured() -> None: assert headers["Authorization"] == "Bearer token" assert headers["SKRouteTag"] == "123" assert headers["iLink-App-Id"] == "bot" - assert headers["iLink-App-ClientVersion"] == str((2 << 16) | (1 << 8) | 1) + assert headers["iLink-App-ClientVersion"] == str((2 << 16) | (1 << 8) | 7) def test_channel_version_matches_reference_plugin_version() -> None: - assert WEIXIN_CHANNEL_VERSION == "2.1.1" + assert WEIXIN_CHANNEL_VERSION == "2.1.7" def test_save_and_load_state_persists_context_tokens(tmp_path) -> None: @@ -1362,3 +1362,71 @@ async def test_poll_loop_logs_exception_and_continues_on_poll_failure(monkeypatc assert call_count == 2 assert any("WeChat poll loop error" in m for m in logged_messages) + + +@pytest.mark.asyncio +async def test_send_text_retries_without_context_token_on_ret_minus_two() -> None: + """If sendmessage returns ret=-2 with a context_token, retry without it.""" + channel, _bus = _make_channel() + channel._client = object() + channel._token = "token" + channel._context_tokens["wx-user"] = "expired-token" + + channel._api_post = AsyncMock( + side_effect=[ + {"ret": -2}, # first attempt with token fails + {"ret": 0}, # retry without token succeeds + ] + ) + + await channel._send_text("wx-user", "hello", "expired-token") + + # Should have called API twice + assert channel._api_post.await_count == 2 + # First call includes context_token + first_body = channel._api_post.await_args_list[0].args[1] + assert first_body["msg"]["context_token"] == "expired-token" + # Second call does NOT include context_token + second_body = channel._api_post.await_args_list[1].args[1] + assert "context_token" not in second_body["msg"] + # Expired token should be cleared from cache + assert "wx-user" not in channel._context_tokens + + +@pytest.mark.asyncio +async def test_send_text_raises_when_ret_minus_two_retry_also_fails() -> None: + """If both attempts (with and without token) return ret=-2, raise.""" + channel, _bus = _make_channel() + channel._client = object() + channel._token = "token" + channel._context_tokens["wx-user"] = "bad-token" + + channel._api_post = AsyncMock( + side_effect=[ + {"ret": -2}, # with token + {"ret": -2}, # without token + ] + ) + + with pytest.raises(RuntimeError, match="ret=-2"): + await channel._send_text("wx-user", "hello", "bad-token") + + assert channel._api_post.await_count == 2 + # Token is NOT cleared because retry also failed + assert channel._context_tokens.get("wx-user") == "bad-token" + + +@pytest.mark.asyncio +async def test_send_text_does_not_retry_without_token_when_no_context_token() -> None: + """If no context_token was provided, ret=-2 should raise immediately.""" + channel, _bus = _make_channel() + channel._client = object() + channel._token = "token" + + channel._api_post = AsyncMock(return_value={"ret": -2}) + + with pytest.raises(RuntimeError, match="ret=-2"): + await channel._send_text("wx-user", "hello", "") + + # Only one API call (no retry) + channel._api_post.assert_awaited_once()