Skip to content

Commit 0c8594c

Browse files
committed
Fix performance regression related to delayed events processing (#18926)
1 parent fcffd2e commit 0c8594c

File tree

10 files changed

+244
-19
lines changed

10 files changed

+244
-19
lines changed

changelog.d/18926.bugfix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a performance regression related to the experimental Delayed Events ([MSC4140](https://github.com/matrix-org/matrix-spec-proposals/pull/4140)) feature.

synapse/handlers/delayed_events.py

Lines changed: 96 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from twisted.internet.interfaces import IDelayedCall
1919

2020
from synapse.api.constants import EventTypes
21-
from synapse.api.errors import ShadowBanError
21+
from synapse.api.errors import ShadowBanError, SynapseError
2222
from synapse.api.ratelimiting import Ratelimiter
2323
from synapse.config.workers import MAIN_PROCESS_INSTANCE_NAME
2424
from synapse.logging.opentracing import set_tag
@@ -45,6 +45,7 @@
4545
)
4646
from synapse.util.events import generate_fake_event_id
4747
from synapse.util.metrics import Measure
48+
from synapse.util.sentinel import Sentinel
4849

4950
if TYPE_CHECKING:
5051
from synapse.server import HomeServer
@@ -146,10 +147,37 @@ async def process() -> None:
146147
)
147148

148149
async def _unsafe_process_new_event(self) -> None:
150+
# We purposefully fetch the current max room stream ordering before
151+
# doing anything else, as it could increment during processing of state
152+
# deltas. We want to avoid updating `delayed_events_stream_pos` past
153+
# the stream ordering of the state deltas we've processed. Otherwise
154+
# we'll leave gaps in our processing.
155+
room_max_stream_ordering = self._store.get_room_max_stream_ordering()
156+
157+
# Check that there are actually any delayed events to process. If not, bail early.
158+
delayed_events_count = await self._store.get_count_of_delayed_events()
159+
if delayed_events_count == 0:
160+
# There are no delayed events to process. Update the
161+
# `delayed_events_stream_pos` to the latest `events` stream pos and
162+
# exit early.
163+
self._event_pos = room_max_stream_ordering
164+
165+
logger.debug(
166+
"No delayed events to process. Updating `delayed_events_stream_pos` to max stream ordering (%s)",
167+
room_max_stream_ordering,
168+
)
169+
170+
await self._store.update_delayed_events_stream_pos(room_max_stream_ordering)
171+
172+
event_processing_positions.labels(
173+
name="delayed_events", **{SERVER_NAME_LABEL: self.server_name}
174+
).set(room_max_stream_ordering)
175+
176+
return
177+
149178
# If self._event_pos is None then means we haven't fetched it from the DB yet
150179
if self._event_pos is None:
151180
self._event_pos = await self._store.get_delayed_events_stream_pos()
152-
room_max_stream_ordering = self._store.get_room_max_stream_ordering()
153181
if self._event_pos > room_max_stream_ordering:
154182
# apparently, we've processed more events than exist in the database!
155183
# this can happen if events are removed with history purge or similar.
@@ -167,7 +195,7 @@ async def _unsafe_process_new_event(self) -> None:
167195
self._clock, name="delayed_events_delta", server_name=self.server_name
168196
):
169197
room_max_stream_ordering = self._store.get_room_max_stream_ordering()
170-
if self._event_pos == room_max_stream_ordering:
198+
if self._event_pos >= room_max_stream_ordering:
171199
return
172200

173201
logger.debug(
@@ -202,23 +230,81 @@ async def _handle_state_deltas(self, deltas: List[StateDelta]) -> None:
202230
Process current state deltas to cancel other users' pending delayed events
203231
that target the same state.
204232
"""
233+
# Get the senders of each delta's state event (as sender information is
234+
# not currently stored in the `current_state_deltas` table).
235+
event_id_and_sender_dict = await self._store.get_senders_for_event_ids(
236+
[delta.event_id for delta in deltas if delta.event_id is not None]
237+
)
238+
239+
# Note: No need to batch as `get_current_state_deltas` will only ever
240+
# return 100 rows at a time.
205241
for delta in deltas:
242+
logger.debug(
243+
"Handling: %r %r, %s", delta.event_type, delta.state_key, delta.event_id
244+
)
245+
246+
# `delta.event_id` and `delta.sender` can be `None` in a few valid
247+
# cases (see the docstring of
248+
# `get_current_state_delta_membership_changes_for_user` for details).
206249
if delta.event_id is None:
207-
logger.debug(
208-
"Not handling delta for deleted state: %r %r",
250+
# TODO: Differentiate between this being caused by a state reset
251+
# which removed a user from a room, or the homeserver
252+
# purposefully having left the room. We can do so by checking
253+
# whether there are any local memberships still left in the
254+
# room. If so, then this is the result of a state reset.
255+
#
256+
# If it is a state reset, we should avoid cancelling new,
257+
# delayed state events due to old state resurfacing. So we
258+
# should skip and log a warning in this case.
259+
#
260+
# If the homeserver has left the room, then we should cancel all
261+
# delayed state events intended for this room, as there is no
262+
# need to try and send a delayed event into a room we've left.
263+
logger.warning(
264+
"Skipping state delta (%r, %r) without corresponding event ID. "
265+
"This can happen if the homeserver has left the room (in which "
266+
"case this can be ignored), or if there has been a state reset "
267+
"which has caused the sender to be kicked out of the room",
209268
delta.event_type,
210269
delta.state_key,
211270
)
212271
continue
213272

214-
logger.debug(
215-
"Handling: %r %r, %s", delta.event_type, delta.state_key, delta.event_id
273+
sender_str = event_id_and_sender_dict.get(
274+
delta.event_id, Sentinel.UNSET_SENTINEL
216275
)
276+
if sender_str is None:
277+
# An event exists, but the `sender` field was "null" and Synapse
278+
# incorrectly accepted the event. This is not expected.
279+
logger.error(
280+
"Skipping state delta with event ID '%s' as 'sender' was None. "
281+
"This is unexpected - please report it as a bug!",
282+
delta.event_id,
283+
)
284+
continue
285+
if sender_str is Sentinel.UNSET_SENTINEL:
286+
# We have an event ID, but the event was not found in the
287+
# datastore. This can happen if a room, or its history, is
288+
# purged. State deltas related to the room are left behind, but
289+
# the event no longer exists.
290+
#
291+
# As we cannot get the sender of this event, we can't calculate
292+
# whether to cancel delayed events related to this one. So we skip.
293+
logger.debug(
294+
"Skipping state delta with event ID '%s' - the room, or its history, may have been purged",
295+
delta.event_id,
296+
)
297+
continue
217298

218-
event = await self._store.get_event(delta.event_id, allow_none=True)
219-
if not event:
299+
try:
300+
sender = UserID.from_string(sender_str)
301+
except SynapseError as e:
302+
logger.error(
303+
"Skipping state delta with Matrix User ID '%s' that failed to parse: %s",
304+
sender_str,
305+
e,
306+
)
220307
continue
221-
sender = UserID.from_string(event.sender)
222308

223309
next_send_ts = await self._store.cancel_delayed_state_events(
224310
room_id=delta.room_id,

synapse/handlers/presence.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1540,7 +1540,7 @@ async def _unsafe_process(self) -> None:
15401540
self.clock, name="presence_delta", server_name=self.server_name
15411541
):
15421542
room_max_stream_ordering = self.store.get_room_max_stream_ordering()
1543-
if self._event_pos == room_max_stream_ordering:
1543+
if self._event_pos >= room_max_stream_ordering:
15441544
return
15451545

15461546
logger.debug(

synapse/handlers/sliding_sync/room_lists.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
#
1414

1515

16-
import enum
1716
import logging
1817
from itertools import chain
1918
from typing import (
@@ -75,6 +74,7 @@
7574
)
7675
from synapse.types.state import StateFilter
7776
from synapse.util import MutableOverlayMapping
77+
from synapse.util.sentinel import Sentinel
7878

7979
if TYPE_CHECKING:
8080
from synapse.server import HomeServer
@@ -83,12 +83,6 @@
8383
logger = logging.getLogger(__name__)
8484

8585

86-
class Sentinel(enum.Enum):
87-
# defining a sentinel in this way allows mypy to correctly handle the
88-
# type of a dictionary lookup and subsequent type narrowing.
89-
UNSET_SENTINEL = object()
90-
91-
9286
# Helper definition for the types that we might return. We do this to avoid
9387
# copying data between types (which can be expensive for many rooms).
9488
RoomsForUserType = Union[RoomsForUserStateReset, RoomsForUser, RoomsForUserSlidingSync]

synapse/storage/controllers/state.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,8 @@ async def get_current_state_deltas(
682682
- the stream id which these results go up to
683683
- list of current_state_delta_stream rows. If it is empty, we are
684684
up to date.
685+
686+
A maximum of 100 rows will be returned.
685687
"""
686688
# FIXME(faster_joins): what do we do here?
687689
# https://github.com/matrix-org/synapse/issues/13008

synapse/storage/databases/main/delayed_events.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,21 @@ def restart_delayed_event_txn(
182182
"restart_delayed_event", restart_delayed_event_txn
183183
)
184184

185+
async def get_count_of_delayed_events(self) -> int:
186+
"""Returns the number of pending delayed events in the DB."""
187+
188+
def _get_count_of_delayed_events(txn: LoggingTransaction) -> int:
189+
sql = "SELECT count(*) FROM delayed_events"
190+
191+
txn.execute(sql)
192+
resp = txn.fetchone()
193+
return resp[0] if resp is not None else 0
194+
195+
return await self.db_pool.runInteraction(
196+
"get_count_of_delayed_events",
197+
_get_count_of_delayed_events,
198+
)
199+
185200
async def get_all_delayed_events_for_user(
186201
self,
187202
user_localpart: str,

synapse/storage/databases/main/events_worker.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2135,6 +2135,39 @@ def get_deltas_for_stream_id_txn(
21352135

21362136
return rows, to_token, True
21372137

2138+
async def get_senders_for_event_ids(
2139+
self, event_ids: Collection[str]
2140+
) -> Dict[str, Optional[str]]:
2141+
"""
2142+
Given a sequence of event IDs, return the sender associated with each.
2143+
2144+
Args:
2145+
event_ids: A collection of event IDs as strings.
2146+
2147+
Returns:
2148+
A dict of event ID -> sender of the event.
2149+
2150+
If a given event ID does not exist in the `events` table, then no entry
2151+
for that event ID will be returned.
2152+
"""
2153+
2154+
def _get_senders_for_event_ids(
2155+
txn: LoggingTransaction,
2156+
) -> Dict[str, Optional[str]]:
2157+
rows = self.db_pool.simple_select_many_txn(
2158+
txn=txn,
2159+
table="events",
2160+
column="event_id",
2161+
iterable=event_ids,
2162+
keyvalues={},
2163+
retcols=["event_id", "sender"],
2164+
)
2165+
return dict(rows)
2166+
2167+
return await self.db_pool.runInteraction(
2168+
"get_senders_for_event_ids", _get_senders_for_event_ids
2169+
)
2170+
21382171
@cached(max_entries=5000)
21392172
async def get_event_ordering(self, event_id: str, room_id: str) -> Tuple[int, int]:
21402173
res = await self.db_pool.simple_select_one(

synapse/storage/databases/main/state_deltas.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ async def get_partial_current_state_deltas(
9494
- the stream id which these results go up to
9595
- list of current_state_delta_stream rows. If it is empty, we are
9696
up to date.
97+
98+
A maximum of 100 rows will be returned.
9799
"""
98100
prev_stream_id = int(prev_stream_id)
99101

synapse/util/sentinel.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#
2+
# This file is licensed under the Affero General Public License (AGPL) version 3.
3+
#
4+
# Copyright (C) 2025 New Vector, Ltd
5+
#
6+
# This program is free software: you can redistribute it and/or modify
7+
# it under the terms of the GNU Affero General Public License as
8+
# published by the Free Software Foundation, either version 3 of the
9+
# License, or (at your option) any later version.
10+
#
11+
# See the GNU Affero General Public License for more details:
12+
# <https://www.gnu.org/licenses/agpl-3.0.html>.
13+
#
14+
15+
import enum
16+
17+
18+
class Sentinel(enum.Enum):
19+
# defining a sentinel in this way allows mypy to correctly handle the
20+
# type of a dictionary lookup and subsequent type narrowing.
21+
UNSET_SENTINEL = object()

tests/storage/test_events.py

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
#
2121

2222
import logging
23-
from typing import List, Optional
23+
from typing import Dict, List, Optional
2424

2525
from twisted.internet.testing import MemoryReactor
2626

@@ -39,6 +39,77 @@
3939
logger = logging.getLogger(__name__)
4040

4141

42+
class EventsTestCase(HomeserverTestCase):
43+
servlets = [
44+
admin.register_servlets,
45+
room.register_servlets,
46+
login.register_servlets,
47+
]
48+
49+
def prepare(
50+
self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer
51+
) -> None:
52+
self._store = self.hs.get_datastores().main
53+
54+
def test_get_senders_for_event_ids(self) -> None:
55+
"""Tests the `get_senders_for_event_ids` storage function."""
56+
57+
users_and_tokens: Dict[str, str] = {}
58+
for localpart_suffix in range(10):
59+
localpart = f"user_{localpart_suffix}"
60+
user_id = self.register_user(localpart, "rabbit")
61+
token = self.login(localpart, "rabbit")
62+
63+
users_and_tokens[user_id] = token
64+
65+
room_creator_user_id = self.register_user("room_creator", "rabbit")
66+
room_creator_token = self.login("room_creator", "rabbit")
67+
users_and_tokens[room_creator_user_id] = room_creator_token
68+
69+
# Create a room and invite some users.
70+
room_id = self.helper.create_room_as(
71+
room_creator_user_id, tok=room_creator_token
72+
)
73+
event_ids_to_senders: Dict[str, str] = {}
74+
for user_id, token in users_and_tokens.items():
75+
if user_id == room_creator_user_id:
76+
continue
77+
78+
self.helper.invite(
79+
room=room_id,
80+
targ=user_id,
81+
tok=room_creator_token,
82+
)
83+
84+
# Have the user accept the invite and join the room.
85+
self.helper.join(
86+
room=room_id,
87+
user=user_id,
88+
tok=token,
89+
)
90+
91+
# Have the user send an event.
92+
response = self.helper.send_event(
93+
room_id=room_id,
94+
type="m.room.message",
95+
content={
96+
"msgtype": "m.text",
97+
"body": f"hello, I'm {user_id}!",
98+
},
99+
tok=token,
100+
)
101+
102+
# Record the event ID and sender.
103+
event_id = response["event_id"]
104+
event_ids_to_senders[event_id] = user_id
105+
106+
# Check that `get_senders_for_event_ids` returns the correct data.
107+
response = self.get_success(
108+
self._store.get_senders_for_event_ids(list(event_ids_to_senders.keys()))
109+
)
110+
self.assert_dict(event_ids_to_senders, response)
111+
112+
42113
class ExtremPruneTestCase(HomeserverTestCase):
43114
servlets = [
44115
admin.register_servlets,

0 commit comments

Comments
 (0)