fix(matrix): stop sync loop on irrecoverable auth errors

When the Matrix homeserver returns M_UNKNOWN_TOKEN / M_FORBIDDEN /
M_UNAUTHORIZED (or soft_logout), the previous _sync_loop kept retrying
sync_forever every 2 seconds forever, spamming the homeserver and
filling logs (#1851). The auth state cannot recover by retrying, so
this is pure noise and a soft DoS on the homeserver.

- Extract `_is_fatal_auth_response()` helper
- In `_on_sync_error`, on fatal auth: set `_running=False` and call
  `stop_sync_forever()` so the loop exits cleanly
- Add exponential backoff (2s → 60s cap) to the generic exception path
  in `_sync_loop` so transient network blips also stop hammering

Closes #1851

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
coldxiangyu 2026-05-01 20:40:09 +08:00 committed by Xubin Ren
parent 539d82eadc
commit 4860a9a6c9
2 changed files with 76 additions and 5 deletions

View File

@ -590,15 +590,26 @@ class MatrixChannel(BaseChannel):
self.client.add_response_callback(self._on_join_error, JoinError)
self.client.add_response_callback(self._on_send_error, RoomSendError)
def _log_response_error(self, label: str, response: Any) -> None:
"""Log Matrix response errors — auth errors at ERROR level, rest at WARNING."""
def _is_fatal_auth_response(self, response: Any) -> bool:
code = getattr(response, "status_code", None)
is_auth = code in {"M_UNKNOWN_TOKEN", "M_FORBIDDEN", "M_UNAUTHORIZED"}
is_fatal = is_auth or getattr(response, "soft_logout", False)
return is_auth or bool(getattr(response, "soft_logout", False))
def _log_response_error(self, label: str, response: Any) -> None:
"""Log Matrix response errors — auth errors at ERROR level, rest at WARNING."""
is_fatal = self._is_fatal_auth_response(response)
(logger.error if is_fatal else logger.warning)("Matrix {} failed: {}", label, response)
async def _on_sync_error(self, response: SyncError) -> None:
self._log_response_error("sync", response)
if self._is_fatal_auth_response(response):
# Auth errors won't recover by retry; stop the sync loop instead of
# spamming the homeserver every 2s (#1851).
logger.error("Matrix authentication failed irrecoverably; stopping sync loop")
self._running = False
if self.client:
with suppress(Exception):
self.client.stop_sync_forever()
async def _on_join_error(self, response: JoinError) -> None:
self._log_response_error("join", response)
@ -640,13 +651,18 @@ class MatrixChannel(BaseChannel):
await self._set_typing(room_id, False)
async def _sync_loop(self) -> None:
backoff = 2.0
while self._running:
try:
await self.client.sync_forever(timeout=30000, full_state=True)
backoff = 2.0
except asyncio.CancelledError:
break
except Exception:
await asyncio.sleep(2)
if not self._running:
break
await asyncio.sleep(backoff)
backoff = min(backoff * 2, 60.0)
async def _on_room_invite(self, room: MatrixRoom, event: InviteEvent) -> None:
if self.is_allowed(event.sender):

View File

@ -7,7 +7,7 @@ import pytest
pytest.importorskip("nio")
pytest.importorskip("nh3")
pytest.importorskip("mistune")
from nio import RoomSendResponse
from nio import RoomSendResponse, SyncError
from nanobot.channels.matrix import _build_matrix_text_content
@ -266,6 +266,61 @@ async def test_start_disables_e2ee_when_configured(
await channel.stop()
@pytest.mark.asyncio
async def test_on_sync_error_stops_loop_on_unknown_token() -> None:
channel = MatrixChannel(_make_config(), MessageBus())
client = _FakeAsyncClient("", "", "", None)
channel.client = client
channel._running = True
await channel._on_sync_error(SyncError(message="bad", status_code="M_UNKNOWN_TOKEN"))
assert channel._running is False
assert client.stop_sync_forever_called is True
@pytest.mark.asyncio
async def test_on_sync_error_keeps_running_on_transient_error() -> None:
channel = MatrixChannel(_make_config(), MessageBus())
client = _FakeAsyncClient("", "", "", None)
channel.client = client
channel._running = True
await channel._on_sync_error(SyncError(message="oops", status_code="M_LIMIT_EXCEEDED"))
assert channel._running is True
assert client.stop_sync_forever_called is False
@pytest.mark.asyncio
async def test_sync_loop_backs_off_on_repeated_errors(monkeypatch) -> None:
channel = MatrixChannel(_make_config(), MessageBus())
sleeps: list[float] = []
async def _fake_sleep(delay: float) -> None:
sleeps.append(delay)
monkeypatch.setattr(matrix_module.asyncio, "sleep", _fake_sleep)
call_count = {"n": 0}
class _BoomClient:
async def sync_forever(self, **_kwargs) -> None:
call_count["n"] += 1
if call_count["n"] > 4:
channel._running = False
return
raise RuntimeError("boom")
channel.client = _BoomClient()
channel._running = True
await channel._sync_loop()
assert sleeps == [2.0, 4.0, 8.0, 16.0]
@pytest.mark.asyncio
async def test_stop_stops_sync_forever_before_close(monkeypatch) -> None:
channel = MatrixChannel(_make_config(device_id="DEVICE"), MessageBus())