From b96628ff29126f25ad0bc4c0309965b96ab5e214 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 6 Oct 2022 23:11:49 -0500 Subject: [PATCH] Fix Complement MSC3030 `can_paginate_after_getting_remote_event_from_timestamp_to_event_endpoint` test flake. Make sure we backfill the closest local event if it's an `outlier` so we can use it with `/context` and paginate `/messages` from this point. Previously we only did this for a remote event. Fix https://github.com/matrix-org/synapse/issues/13944 Why did this fail before? Why was it flakey? ==================================================== Sleuthing the server logs on the CI failure, it looks like `hs2:/timestamp_to_event` found `$NP6-oU7mIFVyhtKfGvfrEQX949hQX-T-gvuauG6eurU` as an `outlier` event locally. Then when we went and asked for it via `/context`, since it's an `outlier`, it was filtered out of the results -> `You don't have permission to access that event.` This is reproducible when `sync_partial_state` races and persists `$NP6-oU7mIFVyhtKfGvfrEQX949hQX-T-gvuauG6eurU` as an `outlier` before we evaluate `get_event_for_timestamp(...)`. To consistently reproduce locally, just add a delay at the [start of `get_event_for_timestamp(...)`](https://github.com/matrix-org/synapse/blob/cb20b885cb4bd1648581dd043a184d86fc8c7a00/synapse/handlers/room.py#L1470-L1496) so it always runs after `sync_partial_state` completes. ```py from twisted.internet import task as twisted_task d = twisted_task.deferLater(self.hs.get_reactor(), 3.5) await d ``` In a run where it passes, on `hs2`, `get_event_for_timestamp(...)` finds a different event locally which is next to a gap and we request from a closer one from `hs1` which gets backfilled. And since the backfilled event is not an `outlier`, it's returned as expected during `/context`. Future changes ========================== In a future PR, it would be nice if `/context` would just backfill the event for us. This would also help with https://github.com/matrix-org/synapse/issues/3848 --- synapse/federation/federation_client.py | 38 +++--- synapse/handlers/room.py | 146 ++++++++++++------------ 2 files changed, 89 insertions(+), 95 deletions(-) diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py index 35bf04aba4cd..da24038d3124 100644 --- a/synapse/federation/federation_client.py +++ b/synapse/federation/federation_client.py @@ -1653,33 +1653,33 @@ async def timestamp_to_event( Returns: A parsed TimestampToEventResponse including the closest event_id and origin_server_ts - - Raises: - SynapseError if the chosen remote server returns a 300/400 code, or - no servers were reachable. """ async def _timestamp_to_event_from_destination( destination: str, - ) -> Optional["TimestampToEventResponse"]: + ) -> TimestampToEventResponse: return await self._timestamp_to_event_from_destination( destination, room_id, timestamp, direction ) - # Loop through each homeserver candidate until we get a succesful response - return await self._try_destination_list( - "timestamp_to_event", - destinations, - # TODO: The requested timestamp may lie in a part of the - # event graph that the remote server *also* didn't have, - # in which case they will have returned another event - # which may be nowhere near the requested timestamp. In - # the future, we may need to reconcile that gap and ask - # other homeservers, and/or extend `/timestamp_to_event` - # to return events on *both* sides of the timestamp to - # help reconcile the gap faster. - _timestamp_to_event_from_destination, - ) + try: + # Loop through each homeserver candidate until we get a succesful response + timestamp_to_event_response = await self._try_destination_list( + "timestamp_to_event", + destinations, + # TODO: The requested timestamp may lie in a part of the + # event graph that the remote server *also* didn't have, + # in which case they will have returned another event + # which may be nowhere near the requested timestamp. In + # the future, we may need to reconcile that gap and ask + # other homeservers, and/or extend `/timestamp_to_event` + # to return events on *both* sides of the timestamp to + # help reconcile the gap faster. + _timestamp_to_event_from_destination, + ) + return timestamp_to_event_response + except SynapseError: + return None async def _timestamp_to_event_from_destination( self, destination: str, room_id: str, timestamp: int, direction: str diff --git a/synapse/handlers/room.py b/synapse/handlers/room.py index 57ab05ad25d9..caf74a5ef2d8 100644 --- a/synapse/handlers/room.py +++ b/synapse/handlers/room.py @@ -49,7 +49,6 @@ from synapse.api.errors import ( AuthError, Codes, - HttpResponseException, LimitExceededError, NotFoundError, StoreError, @@ -60,7 +59,6 @@ from synapse.event_auth import validate_event_for_room_version from synapse.events import EventBase from synapse.events.utils import copy_and_fixup_power_levels_contents -from synapse.federation.federation_client import InvalidResponseError from synapse.handlers.relations import BundledAggregations from synapse.module_api import NOT_SPAM from synapse.rest.admin._base import assert_user_is_admin @@ -1460,6 +1458,8 @@ async def filter_evts(events: List[EventBase]) -> List[EventBase]: class TimestampLookupHandler: def __init__(self, hs: "HomeServer"): + # TODO: Remove + self.hs = hs self.server_name = hs.hostname self.store = hs.get_datastores().main self.state_handler = hs.get_state_handler() @@ -1493,7 +1493,16 @@ async def get_event_for_timestamp( Raises: SynapseError if unable to find any event locally in the given direction """ + # TODO: Remove: this is just here to consistently reproduce the MSC3030 + # Complement test flake + from twisted.internet import task as twisted_task + + d = twisted_task.deferLater(self.hs.get_reactor(), 3.5) + await d + logger.debug( + f"get_event_for_timestamp(room_id={room_id}, timestamp={timestamp}, direction={direction}) Finding closest event..." + ) local_event_id = await self.store.get_event_id_for_timestamp( room_id, timestamp, direction ) @@ -1526,6 +1535,8 @@ async def get_event_for_timestamp( await self.store.is_event_next_to_forward_gap(local_event) ) + likely_domains = None + # If we found a gap, we should probably ask another homeserver first # about more history in between if ( @@ -1545,85 +1556,54 @@ async def get_event_for_timestamp( ) ) - # Loop through each homeserver candidate until we get a succesful response - for domain in likely_domains: - # We don't want to ask our own server for information we don't have - if domain == self.server_name: - continue + remote_response = await self.federation_client.timestamp_to_event( + destinations=likely_domains, + room_id=room_id, + timestamp=timestamp, + direction=direction, + ) + if remote_response is not None: + logger.debug( + "get_event_for_timestamp: remote_response=%s", + remote_response, + ) - try: - remote_response = await self.federation_client.timestamp_to_event( - domain, room_id, timestamp, direction - ) - logger.debug( - "get_event_for_timestamp: response from domain(%s)=%s", - domain, - remote_response, - ) + remote_event_id = remote_response.event_id + remote_origin_server_ts = remote_response.origin_server_ts - remote_event_id = remote_response.event_id - remote_origin_server_ts = remote_response.origin_server_ts - - # Backfill this event so we can get a pagination token for - # it with `/context` and paginate `/messages` from this - # point. - # - # TODO: The requested timestamp may lie in a part of the - # event graph that the remote server *also* didn't have, - # in which case they will have returned another event - # which may be nowhere near the requested timestamp. In - # the future, we may need to reconcile that gap and ask - # other homeservers, and/or extend `/timestamp_to_event` - # to return events on *both* sides of the timestamp to - # help reconcile the gap faster. - remote_event = ( - await self.federation_event_handler.backfill_event_id( - domain, room_id, remote_event_id - ) - ) + # Backfill this event so we can get a pagination token for + # it with `/context` and paginate `/messages` from this + # point. + pulled_pdu_info = await self.federation_event_handler.backfill_event_id( + likely_domains, room_id, remote_event_id + ) + remote_event = pulled_pdu_info.pdu - # XXX: When we see that the remote server is not trustworthy, - # maybe we should not ask them first in the future. - if remote_origin_server_ts != remote_event.origin_server_ts: - logger.info( - "get_event_for_timestamp: Remote server (%s) claimed that remote_event_id=%s occured at remote_origin_server_ts=%s but that isn't true (actually occured at %s). Their claims are dubious and we should consider not trusting them.", - domain, - remote_event_id, - remote_origin_server_ts, - remote_event.origin_server_ts, - ) - - # Only return the remote event if it's closer than the local event - if not local_event or ( - abs(remote_event.origin_server_ts - timestamp) - < abs(local_event.origin_server_ts - timestamp) - ): - logger.info( - "get_event_for_timestamp: returning remote_event_id=%s (%s) since it's closer to timestamp=%s than local_event=%s (%s)", - remote_event_id, - remote_event.origin_server_ts, - timestamp, - local_event.event_id if local_event else None, - local_event.origin_server_ts if local_event else None, - ) - return remote_event_id, remote_origin_server_ts - except (HttpResponseException, InvalidResponseError) as ex: - # Let's not put a high priority on some other homeserver - # failing to respond or giving a random response - logger.debug( - "get_event_for_timestamp: Failed to fetch /timestamp_to_event from %s because of exception(%s) %s args=%s", - domain, - type(ex).__name__, - ex, - ex.args, + # XXX: When we see that the remote server is not trustworthy, + # maybe we should not ask them first in the future. + if remote_origin_server_ts != remote_event.origin_server_ts: + logger.info( + "get_event_for_timestamp: Remote server (%s) claimed that remote_event_id=%s occured at remote_origin_server_ts=%s but that isn't true (actually occured at %s). Their claims are dubious and we should consider not trusting them.", + pulled_pdu_info.pull_origin, + remote_event_id, + remote_origin_server_ts, + remote_event.origin_server_ts, ) - except Exception: - # But we do want to see some exceptions in our code - logger.warning( - "get_event_for_timestamp: Failed to fetch /timestamp_to_event from %s because of exception", - domain, - exc_info=True, + + # Only return the remote event if it's closer than the local event + if not local_event or ( + abs(remote_event.origin_server_ts - timestamp) + < abs(local_event.origin_server_ts - timestamp) + ): + logger.info( + "get_event_for_timestamp: returning remote_event_id=%s (%s) since it's closer to timestamp=%s than local_event=%s (%s)", + remote_event_id, + remote_event.origin_server_ts, + timestamp, + local_event.event_id if local_event else None, + local_event.origin_server_ts if local_event else None, ) + return remote_event_id, remote_origin_server_ts # To appease mypy, we have to add both of these conditions to check for # `None`. We only expect `local_event` to be `None` when @@ -1635,6 +1615,20 @@ async def get_event_for_timestamp( errcode=Codes.NOT_FOUND, ) + if local_event.internal_metadata.outlier: + # We might already have these on hand from asking a remote server before + if likely_domains is None: + likely_domains = await self._storage_controllers.state.get_current_hosts_in_room_ordered( + room_id + ) + + # Backfill this event so we can get a pagination token for + # it with `/context` and paginate `/messages` from this + # point. + await self.federation_event_handler.backfill_event_id( + likely_domains, room_id, local_event.event_id + ) + return local_event_id, local_event.origin_server_ts