fix(weixin): retry send without expired context_token on ret=-2

When the iLink API returns ret=-2 (parameter error), it is often caused
by an expired context_token rather than a malformed payload. After a
gateway restart, the cached token can become stale within ~90 seconds if
no new inbound message refreshes it, causing all outbound replies to fail
silently.

Changes:
- _send_text: retry once without context_token when ret=-2 and a token
  was present; if the retry succeeds, clear the expired token from cache.
- Remove leftover @staticmethod on _check_response_error so self.logger
  and the body parameter work correctly.
- Bump WEIXIN_CHANNEL_VERSION from 2.1.1 -> 2.1.7 to match the reference
  openclaw-weixin plugin.
- Add tests covering the ret=-2 retry path, failure path, and no-token
  path.

References:
- openclaw/openclaw#61174 (context_token expiry after long agent turns)
- hermes-agent#21011 (ret=-2 rate limiting / parameter error)
This commit is contained in:
chengyongru 2026-05-07 17:38:11 +08:00
parent e9f4a868a8
commit 28358980ed
2 changed files with 110 additions and 11 deletions

View File

@ -11,6 +11,7 @@ from __future__ import annotations
import asyncio
import base64
import copy
import hashlib
import json
import os
@ -54,7 +55,7 @@ MESSAGE_TYPE_BOT = 2
MESSAGE_STATE_FINISH = 2
WEIXIN_MAX_MESSAGE_LEN = 4000
WEIXIN_CHANNEL_VERSION = "2.1.1"
WEIXIN_CHANNEL_VERSION = "2.1.7"
ILINK_APP_ID = "bot"
@ -526,8 +527,7 @@ class WeixinChannel(BaseChannel):
f"WeChat session paused, {remaining_min} min remaining (errcode {ERRCODE_SESSION_EXPIRED})"
)
@staticmethod
def _check_response_error(data: dict, operation: str) -> None:
def _check_response_error(self, data: dict, operation: str, *, body: dict | None = None) -> None:
"""Check both ``ret`` and ``errcode`` like the reference TS code.
The iLink API may signal failure through either field (or both).
@ -537,10 +537,11 @@ class WeixinChannel(BaseChannel):
ret = data.get("ret", 0)
errcode = data.get("errcode", 0)
is_error = (ret is not None and ret != 0) or (errcode is not None and errcode != 0)
if is_error:
raise RuntimeError(
f"WeChat {operation} error (ret={ret}, errcode={errcode}): {data.get('errmsg', '')}"
)
if not is_error:
return
raise RuntimeError(
f"WeChat {operation} error (ret={ret}, errcode={errcode}): {data.get('errmsg', '')}"
)
async def _poll_once(self) -> None:
remaining = self._session_pause_remaining_s()
@ -1139,7 +1140,37 @@ class WeixinChannel(BaseChannel):
}
data = await self._api_post("ilink/bot/sendmessage", body)
self._check_response_error(data, "send text")
ret = data.get("ret", 0)
errcode = data.get("errcode", 0)
# If ret=-2 (parameter error / rate limit / expired token) and we sent
# with a context_token, retry once without it. The openclaw reference
# plugin can send without a token (it just warns), and issue #61174
# shows that expired context_tokens are a common cause of ret=-2.
if ret == -2 and context_token:
self.logger.warning(
"WeChat send text returned ret=-2 for {} (client_id={}); "
"retrying without context_token",
to_user_id,
client_id,
)
body_no_ctx = copy.deepcopy(body)
body_no_ctx["msg"].pop("context_token", None)
data = await self._api_post("ilink/bot/sendmessage", body_no_ctx)
ret = data.get("ret", 0)
errcode = data.get("errcode", 0)
if ret == 0 and (errcode == 0 or errcode is None):
self.logger.warning(
"WeChat send text succeeded WITHOUT context_token for {}; "
"clearing expired token from cache",
to_user_id,
)
self._context_tokens.pop(to_user_id, None)
self._save_state()
self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id)
return
self._check_response_error(data, "send text", body=body)
self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id)
async def _send_media_file(
@ -1286,7 +1317,7 @@ class WeixinChannel(BaseChannel):
}
data = await self._api_post("ilink/bot/sendmessage", body)
self._check_response_error(data, "send media")
self._check_response_error(data, "send media", body=body)
# ---------------------------------------------------------------------------

View File

@ -48,11 +48,11 @@ def test_make_headers_includes_route_tag_when_configured() -> None:
assert headers["Authorization"] == "Bearer token"
assert headers["SKRouteTag"] == "123"
assert headers["iLink-App-Id"] == "bot"
assert headers["iLink-App-ClientVersion"] == str((2 << 16) | (1 << 8) | 1)
assert headers["iLink-App-ClientVersion"] == str((2 << 16) | (1 << 8) | 7)
def test_channel_version_matches_reference_plugin_version() -> None:
assert WEIXIN_CHANNEL_VERSION == "2.1.1"
assert WEIXIN_CHANNEL_VERSION == "2.1.7"
def test_save_and_load_state_persists_context_tokens(tmp_path) -> None:
@ -1362,3 +1362,71 @@ async def test_poll_loop_logs_exception_and_continues_on_poll_failure(monkeypatc
assert call_count == 2
assert any("WeChat poll loop error" in m for m in logged_messages)
@pytest.mark.asyncio
async def test_send_text_retries_without_context_token_on_ret_minus_two() -> None:
    """A ret=-2 reply to a send carrying a context_token triggers one token-less retry.

    The mocked API answers ``{"ret": -2}`` first and ``{"ret": 0}`` second.
    The test then checks that two POSTs were made, that only the first one
    carried the token, and that the cached token for the recipient is gone.
    """
    user_id = "wx-user"
    stale_token = "expired-token"
    api_responses = [
        {"ret": -2},  # attempt 1: sent with the token, rejected
        {"ret": 0},  # attempt 2: sent without the token, accepted
    ]

    channel, _bus = _make_channel()
    channel._client = object()
    channel._token = "token"
    channel._context_tokens[user_id] = stale_token
    channel._api_post = AsyncMock(side_effect=api_responses)

    await channel._send_text(user_id, "hello", stale_token)

    # Exactly two POSTs: the original send and the single retry.
    assert channel._api_post.await_count == 2

    # The request body is the second positional argument of each call.
    with_token_call, without_token_call = channel._api_post.await_args_list
    sent_with_token = with_token_call.args[1]
    sent_without_token = without_token_call.args[1]

    assert sent_with_token["msg"]["context_token"] == stale_token
    assert "context_token" not in sent_without_token["msg"]

    # A successful token-less retry must evict the stale token from the cache.
    assert user_id not in channel._context_tokens
@pytest.mark.asyncio
async def test_send_text_raises_when_ret_minus_two_retry_also_fails() -> None:
    """When the send and its token-less retry both return ret=-2, the error surfaces.

    Expects a ``RuntimeError`` mentioning ``ret=-2``, exactly two API calls,
    and the cached context token left in place.
    """
    user_id = "wx-user"
    cached_token = "bad-token"
    # Same failure for both attempts: first with the token, then without it.
    always_rejected = [{"ret": -2}, {"ret": -2}]

    channel, _bus = _make_channel()
    channel._client = object()
    channel._token = "token"
    channel._context_tokens[user_id] = cached_token
    channel._api_post = AsyncMock(side_effect=always_rejected)

    with pytest.raises(RuntimeError, match="ret=-2"):
        await channel._send_text(user_id, "hello", cached_token)

    assert channel._api_post.await_count == 2

    # The retry failed too, so the cached token must not have been evicted.
    assert channel._context_tokens.get(user_id) == cached_token
@pytest.mark.asyncio
async def test_send_text_does_not_retry_without_token_when_no_context_token() -> None:
    """With an empty context_token, a ret=-2 response raises after a single POST."""
    rejected = {"ret": -2}

    channel, _bus = _make_channel()
    channel._client = object()
    channel._token = "token"
    channel._api_post = AsyncMock(return_value=rejected)

    # An empty string stands in for "no context token available".
    no_token = ""
    with pytest.raises(RuntimeError, match="ret=-2"):
        await channel._send_text("wx-user", "hello", no_token)

    # No token was sent, so there is nothing to strip and no retry is made.
    channel._api_post.assert_awaited_once()