mirror of
https://github.com/HKUDS/nanobot.git
synced 2026-05-19 16:12:30 +00:00
fix(weixin): distinguish stale session from rate limit on ret=-2
Reference hermes-agent#17228 / #18100 / PR#18105. iLink returns ret=-2 / errcode=-2 for two different reasons: - stale context_token: errmsg is empty/None or "unknown error" - genuine rate limit: errmsg is populated (e.g. "frequency limit") Previously we swallowed all ret=-2 responses, which caused silent message drops when the context_token was stale. Changes: - Add _is_stale_session_ret() to detect empty/"unknown error" errmsg - _send_text/_send_media_file retry once without context_token on stale session signal, then raise on persistent failure so ChannelManager can retry with backoff - Remove error-swallowing behavior - Update tests to expect raises and add TestIsStaleSessionRet coverage
This commit is contained in:
parent
9fefb31344
commit
12005c20f0
@ -80,6 +80,36 @@ BASE_INFO: dict[str, str] = {"channel_version": WEIXIN_CHANNEL_VERSION}
|
|||||||
ERRCODE_SESSION_EXPIRED = -14
|
ERRCODE_SESSION_EXPIRED = -14
|
||||||
SESSION_PAUSE_DURATION_S = 60 * 60
|
SESSION_PAUSE_DURATION_S = 60 * 60
|
||||||
|
|
||||||
|
# iLink rate-limit / stale-session errcode
|
||||||
|
RATE_LIMIT_ERRCODE = -2
|
||||||
|
|
||||||
|
|
||||||
|
def _is_stale_session_ret(
|
||||||
|
ret: int | None,
|
||||||
|
errcode: int | None,
|
||||||
|
errmsg: str | None,
|
||||||
|
) -> bool:
|
||||||
|
"""True when iLink returns ret=-2 / errcode=-2 that is likely a stale
|
||||||
|
context_token rather than a genuine rate limit.
|
||||||
|
|
||||||
|
Empirically iLink signals these two scenarios weakly:
|
||||||
|
- stale session: ret=-2, errmsg="unknown error" OR errmsg empty/None
|
||||||
|
- genuine rate limit: ret=-2 with a populated errmsg such as
|
||||||
|
"frequency limit" / "too frequently" / similar
|
||||||
|
|
||||||
|
Treating "unknown error" and empty/None errmsg as stale-session signals
|
||||||
|
lets the caller attempt one tokenless retry. A true rate limit still
|
||||||
|
falls through to the existing retry/backoff path if the tokenless
|
||||||
|
attempt also fails.
|
||||||
|
"""
|
||||||
|
if ret != RATE_LIMIT_ERRCODE and errcode != RATE_LIMIT_ERRCODE:
|
||||||
|
return False
|
||||||
|
msg = (errmsg or "").strip().lower()
|
||||||
|
if not msg:
|
||||||
|
return True
|
||||||
|
return msg == "unknown error"
|
||||||
|
|
||||||
|
|
||||||
# Retry constants (matching the reference plugin's monitor.ts)
|
# Retry constants (matching the reference plugin's monitor.ts)
|
||||||
MAX_CONSECUTIVE_FAILURES = 3
|
MAX_CONSECUTIVE_FAILURES = 3
|
||||||
BACKOFF_DELAY_S = 30
|
BACKOFF_DELAY_S = 30
|
||||||
@ -1149,49 +1179,42 @@ class WeixinChannel(BaseChannel):
|
|||||||
data = await self._api_post("ilink/bot/sendmessage", body)
|
data = await self._api_post("ilink/bot/sendmessage", body)
|
||||||
ret = data.get("ret", 0)
|
ret = data.get("ret", 0)
|
||||||
errcode = data.get("errcode", 0)
|
errcode = data.get("errcode", 0)
|
||||||
|
errmsg = data.get("errmsg", "")
|
||||||
|
|
||||||
# The iLink sendmessage API frequently returns ret=-2 (parameter
|
# The iLink sendmessage API may return ret=-2 / errcode=-2 for two
|
||||||
# error / rate limit / expired token) even though HTTP status is 200.
|
# different reasons:
|
||||||
# The openclaw reference plugin ignores the JSON body for sendmessage
|
# - stale context_token: errmsg is empty/None or "unknown error"
|
||||||
# and only checks HTTP status. We retry once without context_token
|
# - genuine rate limit: errmsg is populated (e.g. "frequency limit")
|
||||||
# (a common fix for token expiry per openclaw#61174), but if that
|
# Per hermes-agent#17228 / #18100, the empty/None variant is a stale
|
||||||
# also fails we swallow ret=-2 to avoid silent message drops.
|
# session signal. Retry once without context_token (iLink accepts
|
||||||
if ret == -2:
|
# tokenless sends as a degraded fallback). If the tokenless attempt
|
||||||
if context_token:
|
# also fails, let _check_response_error raise so ChannelManager can
|
||||||
self.logger.warning(
|
# retry with backoff — do NOT swallow the error.
|
||||||
"WeChat send text returned ret=-2 for {} (client_id={}); "
|
if _is_stale_session_ret(ret, errcode, errmsg) and context_token:
|
||||||
"retrying without context_token",
|
|
||||||
to_user_id,
|
|
||||||
client_id,
|
|
||||||
)
|
|
||||||
body_no_ctx = copy.deepcopy(body)
|
|
||||||
body_no_ctx["msg"].pop("context_token", None)
|
|
||||||
data = await self._api_post("ilink/bot/sendmessage", body_no_ctx)
|
|
||||||
ret = data.get("ret", 0)
|
|
||||||
errcode = data.get("errcode", 0)
|
|
||||||
if ret == 0 and (errcode == 0 or errcode is None):
|
|
||||||
self.logger.warning(
|
|
||||||
"WeChat send text succeeded WITHOUT context_token for {}; "
|
|
||||||
"clearing expired token from cache",
|
|
||||||
to_user_id,
|
|
||||||
)
|
|
||||||
self._context_tokens.pop(to_user_id, None)
|
|
||||||
self._save_state()
|
|
||||||
self.logger.debug(
|
|
||||||
"WeChat text sent to {} (client_id={})", to_user_id, client_id
|
|
||||||
)
|
|
||||||
return
|
|
||||||
# Treat persistent ret=-2 as non-fatal (matching reference plugin).
|
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
"WeChat send text returned ret=-2 for {} (client_id={}); "
|
"WeChat send text returned stale-session signal for {} (client_id={}); "
|
||||||
"treating as non-fatal. Request body: {}. Response: {}",
|
"retrying without context_token",
|
||||||
to_user_id,
|
to_user_id,
|
||||||
client_id,
|
client_id,
|
||||||
json.dumps(body, ensure_ascii=False, default=str),
|
|
||||||
json.dumps(data, ensure_ascii=False, default=str),
|
|
||||||
)
|
)
|
||||||
self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id)
|
body_no_ctx = copy.deepcopy(body)
|
||||||
return
|
body_no_ctx["msg"].pop("context_token", None)
|
||||||
|
data = await self._api_post("ilink/bot/sendmessage", body_no_ctx)
|
||||||
|
ret = data.get("ret", 0)
|
||||||
|
errcode = data.get("errcode", 0)
|
||||||
|
errmsg = data.get("errmsg", "")
|
||||||
|
if ret == 0 and (errcode == 0 or errcode is None):
|
||||||
|
self.logger.warning(
|
||||||
|
"WeChat send text succeeded WITHOUT context_token for {}; "
|
||||||
|
"clearing expired token from cache",
|
||||||
|
to_user_id,
|
||||||
|
)
|
||||||
|
self._context_tokens.pop(to_user_id, None)
|
||||||
|
self._save_state()
|
||||||
|
self.logger.debug(
|
||||||
|
"WeChat text sent to {} (client_id={})", to_user_id, client_id
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
self._check_response_error(data, "send text", body=body)
|
self._check_response_error(data, "send text", body=body)
|
||||||
self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id)
|
self.logger.debug("WeChat text sent to {} (client_id={})", to_user_id, client_id)
|
||||||
@ -1341,14 +1364,33 @@ class WeixinChannel(BaseChannel):
|
|||||||
|
|
||||||
data = await self._api_post("ilink/bot/sendmessage", body)
|
data = await self._api_post("ilink/bot/sendmessage", body)
|
||||||
ret = data.get("ret", 0)
|
ret = data.get("ret", 0)
|
||||||
if ret == -2:
|
errcode = data.get("errcode", 0)
|
||||||
# See _send_text for rationale: openclaw ignores ret on sendmessage.
|
errmsg = data.get("errmsg", "")
|
||||||
|
|
||||||
|
# Same stale-session handling as _send_text (hermes-agent#17228 / #18100).
|
||||||
|
if _is_stale_session_ret(ret, errcode, errmsg) and context_token:
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
"WeChat send media returned ret=-2 for {} (client_id={}); treating as non-fatal",
|
"WeChat send media returned stale-session signal for {} (client_id={}); "
|
||||||
|
"retrying without context_token",
|
||||||
to_user_id,
|
to_user_id,
|
||||||
client_id,
|
client_id,
|
||||||
)
|
)
|
||||||
return
|
body_no_ctx = copy.deepcopy(body)
|
||||||
|
body_no_ctx["msg"].pop("context_token", None)
|
||||||
|
data = await self._api_post("ilink/bot/sendmessage", body_no_ctx)
|
||||||
|
ret = data.get("ret", 0)
|
||||||
|
errcode = data.get("errcode", 0)
|
||||||
|
errmsg = data.get("errmsg", "")
|
||||||
|
if ret == 0 and (errcode == 0 or errcode is None):
|
||||||
|
self.logger.warning(
|
||||||
|
"WeChat send media succeeded WITHOUT context_token for {}; "
|
||||||
|
"clearing expired token from cache",
|
||||||
|
to_user_id,
|
||||||
|
)
|
||||||
|
self._context_tokens.pop(to_user_id, None)
|
||||||
|
self._save_state()
|
||||||
|
return
|
||||||
|
|
||||||
self._check_response_error(data, "send media", body=body)
|
self._check_response_error(data, "send media", body=body)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1394,8 +1394,8 @@ async def test_send_text_retries_without_context_token_on_ret_minus_two() -> Non
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_send_text_swallows_ret_minus_two_when_retry_also_fails() -> None:
|
async def test_send_text_raises_when_retry_also_fails_with_stale_session() -> None:
|
||||||
"""If both attempts return ret=-2, swallow the error (matching openclaw)."""
|
"""If both attempts return stale-session ret=-2, raise so ChannelManager retries."""
|
||||||
channel, _bus = _make_channel()
|
channel, _bus = _make_channel()
|
||||||
channel._client = object()
|
channel._client = object()
|
||||||
channel._token = "token"
|
channel._token = "token"
|
||||||
@ -1408,8 +1408,8 @@ async def test_send_text_swallows_ret_minus_two_when_retry_also_fails() -> None:
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Should NOT raise
|
with pytest.raises(RuntimeError, match="WeChat send text error"):
|
||||||
await channel._send_text("wx-user", "hello", "bad-token")
|
await channel._send_text("wx-user", "hello", "bad-token")
|
||||||
|
|
||||||
assert channel._api_post.await_count == 2
|
assert channel._api_post.await_count == 2
|
||||||
# Token is NOT cleared because retry also failed
|
# Token is NOT cleared because retry also failed
|
||||||
@ -1417,16 +1417,55 @@ async def test_send_text_swallows_ret_minus_two_when_retry_also_fails() -> None:
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_send_text_swallows_ret_minus_two_when_no_context_token() -> None:
|
async def test_send_text_raises_on_ret_minus_two_when_no_context_token() -> None:
|
||||||
"""If no context_token was provided, ret=-2 is swallowed without retry."""
|
"""If no context_token was provided, ret=-2 stale session is raised."""
|
||||||
channel, _bus = _make_channel()
|
channel, _bus = _make_channel()
|
||||||
channel._client = object()
|
channel._client = object()
|
||||||
channel._token = "token"
|
channel._token = "token"
|
||||||
|
|
||||||
channel._api_post = AsyncMock(return_value={"ret": -2})
|
channel._api_post = AsyncMock(return_value={"ret": -2})
|
||||||
|
|
||||||
# Should NOT raise
|
with pytest.raises(RuntimeError, match="WeChat send text error"):
|
||||||
await channel._send_text("wx-user", "hello", "")
|
await channel._send_text("wx-user", "hello", "")
|
||||||
|
|
||||||
# Only one API call (no retry)
|
# Only one API call (no retry possible without token)
|
||||||
channel._api_post.assert_awaited_once()
|
channel._api_post.assert_awaited_once()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tests for _is_stale_session_ret (hermes-agent#17228 / #18105)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsStaleSessionRet:
|
||||||
|
"""Verify stale-session detection for iLink ret=-2 / errcode=-2 responses."""
|
||||||
|
|
||||||
|
def test_ret_minus_2_with_empty_errmsg_is_stale(self):
|
||||||
|
assert weixin_mod._is_stale_session_ret(-2, 0, "") is True
|
||||||
|
assert weixin_mod._is_stale_session_ret(-2, 0, None) is True
|
||||||
|
|
||||||
|
def test_errcode_minus_2_with_empty_errmsg_is_stale(self):
|
||||||
|
assert weixin_mod._is_stale_session_ret(0, -2, "") is True
|
||||||
|
assert weixin_mod._is_stale_session_ret(0, -2, None) is True
|
||||||
|
|
||||||
|
def test_ret_minus_2_with_unknown_error_is_stale(self):
|
||||||
|
assert weixin_mod._is_stale_session_ret(-2, 0, "unknown error") is True
|
||||||
|
assert weixin_mod._is_stale_session_ret(-2, 0, "UNKNOWN ERROR") is True
|
||||||
|
|
||||||
|
def test_errcode_minus_2_with_unknown_error_is_stale(self):
|
||||||
|
assert weixin_mod._is_stale_session_ret(0, -2, "unknown error") is True
|
||||||
|
|
||||||
|
def test_ret_minus_2_with_frequency_limit_is_not_stale(self):
|
||||||
|
assert weixin_mod._is_stale_session_ret(-2, 0, "frequency limit") is False
|
||||||
|
assert weixin_mod._is_stale_session_ret(-2, 0, "too frequently") is False
|
||||||
|
|
||||||
|
def test_errcode_minus_2_with_frequency_limit_is_not_stale(self):
|
||||||
|
assert weixin_mod._is_stale_session_ret(0, -2, "freq limit") is False
|
||||||
|
|
||||||
|
def test_success_codes_are_not_stale(self):
|
||||||
|
assert weixin_mod._is_stale_session_ret(0, 0, "") is False
|
||||||
|
assert weixin_mod._is_stale_session_ret(0, 0, None) is False
|
||||||
|
|
||||||
|
def test_other_errors_are_not_stale(self):
|
||||||
|
assert weixin_mod._is_stale_session_ret(-14, -14, "session timeout") is False
|
||||||
|
assert weixin_mod._is_stale_session_ret(-100, 0, "internal error") is False
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user