From 4860a9a6c9c4c2ba83e9f7e11c48007dc55a474b Mon Sep 17 00:00:00 2001 From: coldxiangyu Date: Fri, 1 May 2026 20:40:09 +0800 Subject: [PATCH] fix(matrix): stop sync loop on irrecoverable auth errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the Matrix homeserver returns M_UNKNOWN_TOKEN / M_FORBIDDEN / M_UNAUTHORIZED (or soft_logout), the previous _sync_loop kept retrying sync_forever every 2 seconds forever, spamming the homeserver and filling logs (#1851). The auth state cannot recover by retrying, so this is pure noise and a soft DoS on the homeserver. - Extract `_is_fatal_auth_response()` helper - In `_on_sync_error`, on fatal auth: set `_running=False` and call `stop_sync_forever()` so the loop exits cleanly - Add exponential backoff (2s → 60s cap) to the generic exception path in `_sync_loop` so transient network blips also stop hammering Closes #1851 Co-Authored-By: Claude Opus 4.7 (1M context) --- nanobot/channels/matrix.py | 24 +++++++++-- tests/channels/test_matrix_channel.py | 57 ++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 5 deletions(-) diff --git a/nanobot/channels/matrix.py b/nanobot/channels/matrix.py index b53ae8607..0d1989b03 100644 --- a/nanobot/channels/matrix.py +++ b/nanobot/channels/matrix.py @@ -590,15 +590,26 @@ class MatrixChannel(BaseChannel): self.client.add_response_callback(self._on_join_error, JoinError) self.client.add_response_callback(self._on_send_error, RoomSendError) - def _log_response_error(self, label: str, response: Any) -> None: - """Log Matrix response errors — auth errors at ERROR level, rest at WARNING.""" + def _is_fatal_auth_response(self, response: Any) -> bool: code = getattr(response, "status_code", None) is_auth = code in {"M_UNKNOWN_TOKEN", "M_FORBIDDEN", "M_UNAUTHORIZED"} - is_fatal = is_auth or getattr(response, "soft_logout", False) + return is_auth or bool(getattr(response, "soft_logout", False)) + + def _log_response_error(self, label: str, response: Any) -> None: + """Log Matrix response errors — auth errors at ERROR level, rest at WARNING.""" + is_fatal = self._is_fatal_auth_response(response) (logger.error if is_fatal else logger.warning)("Matrix {} failed: {}", label, response) async def _on_sync_error(self, response: SyncError) -> None: self._log_response_error("sync", response) + if self._is_fatal_auth_response(response): + # Auth errors won't recover by retry; stop the sync loop instead of + # spamming the homeserver every 2s (#1851). + logger.error("Matrix authentication failed irrecoverably; stopping sync loop") + self._running = False + if self.client: + with suppress(Exception): + self.client.stop_sync_forever() async def _on_join_error(self, response: JoinError) -> None: self._log_response_error("join", response) @@ -640,13 +651,18 @@ class MatrixChannel(BaseChannel): await self._set_typing(room_id, False) async def _sync_loop(self) -> None: + backoff = 2.0 while self._running: try: await self.client.sync_forever(timeout=30000, full_state=True) + backoff = 2.0 except asyncio.CancelledError: break except Exception: - await asyncio.sleep(2) + if not self._running: + break + await asyncio.sleep(backoff) + backoff = min(backoff * 2, 60.0) async def _on_room_invite(self, room: MatrixRoom, event: InviteEvent) -> None: if self.is_allowed(event.sender): diff --git a/tests/channels/test_matrix_channel.py b/tests/channels/test_matrix_channel.py index 8fbe2c119..8bd9f8154 100644 --- a/tests/channels/test_matrix_channel.py +++ b/tests/channels/test_matrix_channel.py @@ -7,7 +7,7 @@ import pytest pytest.importorskip("nio") pytest.importorskip("nh3") pytest.importorskip("mistune") -from nio import RoomSendResponse +from nio import RoomSendResponse, SyncError from nanobot.channels.matrix import _build_matrix_text_content @@ -266,6 +266,61 @@ async def test_start_disables_e2ee_when_configured( await channel.stop() +@pytest.mark.asyncio +async def test_on_sync_error_stops_loop_on_unknown_token() -> None: + channel = MatrixChannel(_make_config(), MessageBus()) + client = _FakeAsyncClient("", "", "", None) + channel.client = client + channel._running = True + + await channel._on_sync_error(SyncError(message="bad", status_code="M_UNKNOWN_TOKEN")) + + assert channel._running is False + assert client.stop_sync_forever_called is True + + +@pytest.mark.asyncio +async def test_on_sync_error_keeps_running_on_transient_error() -> None: + channel = MatrixChannel(_make_config(), MessageBus()) + client = _FakeAsyncClient("", "", "", None) + channel.client = client + channel._running = True + + await channel._on_sync_error(SyncError(message="oops", status_code="M_LIMIT_EXCEEDED")) + + assert channel._running is True + assert client.stop_sync_forever_called is False + + +@pytest.mark.asyncio +async def test_sync_loop_backs_off_on_repeated_errors(monkeypatch) -> None: + channel = MatrixChannel(_make_config(), MessageBus()) + + sleeps: list[float] = [] + + async def _fake_sleep(delay: float) -> None: + sleeps.append(delay) + + monkeypatch.setattr(matrix_module.asyncio, "sleep", _fake_sleep) + + call_count = {"n": 0} + + class _BoomClient: + async def sync_forever(self, **_kwargs) -> None: + call_count["n"] += 1 + if call_count["n"] > 4: + channel._running = False + return + raise RuntimeError("boom") + + channel.client = _BoomClient() + channel._running = True + + await channel._sync_loop() + + assert sleeps == [2.0, 4.0, 8.0, 16.0] + + @pytest.mark.asyncio async def test_stop_stops_sync_forever_before_close(monkeypatch) -> None: channel = MatrixChannel(_make_config(device_id="DEVICE"), MessageBus())