Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Fix Complement MSC3030 `can_paginate_after_getting_remote_event_from_…
Browse files Browse the repository at this point in the history
…timestamp_to_event_endpoint` test flake. Make sure we backfill the closest local event if it's an `outlier` so we can use it with `/context` and paginate `/messages` from this point. Previously we only did this for a remote event.

Fix  #13944

Why did this fail before? Why was it flakey?
====================================================

Sleuthing the server logs on the CI failure, it looks like `hs2:/timestamp_to_event` found `$NP6-oU7mIFVyhtKfGvfrEQX949hQX-T-gvuauG6eurU` as an `outlier` event locally. Then when we went and asked for it via `/context`, since it's an `outlier`, it was filtered out of the results -> `You don't have permission to access that event.`

This is reproducible when `sync_partial_state` races and persists `$NP6-oU7mIFVyhtKfGvfrEQX949hQX-T-gvuauG6eurU` as an `outlier` before we evaluate `get_event_for_timestamp(...)`. To consistently reproduce locally, just add a delay at the [start of `get_event_for_timestamp(...)`](https://github.com/matrix-org/synapse/blob/cb20b885cb4bd1648581dd043a184d86fc8c7a00/synapse/handlers/room.py#L1470-L1496) so it always runs after `sync_partial_state` completes.

```py
from twisted.internet import task as twisted_task
d = twisted_task.deferLater(self.hs.get_reactor(), 3.5)
await d
```

In a run where it passes, on `hs2`, `get_event_for_timestamp(...)` finds a different event locally which is next to a gap and we request from a closer one from `hs1` which gets backfilled. And since the backfilled event is not an `outlier`, it's returned as expected during `/context`.

Future changes
==========================

In a future PR, it would be nice if `/context` would just backfill the event for us. This would also help with #3848
  • Loading branch information
MadLittleMods committed Oct 7, 2022
1 parent 1ac40e6 commit b96628f
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 95 deletions.
38 changes: 19 additions & 19 deletions synapse/federation/federation_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -1653,33 +1653,33 @@ async def timestamp_to_event(
Returns:
A parsed TimestampToEventResponse including the closest event_id
and origin_server_ts
Raises:
SynapseError if the chosen remote server returns a 300/400 code, or
no servers were reachable.
"""

async def _timestamp_to_event_from_destination(
destination: str,
) -> Optional["TimestampToEventResponse"]:
) -> TimestampToEventResponse:
return await self._timestamp_to_event_from_destination(
destination, room_id, timestamp, direction
)

# Loop through each homeserver candidate until we get a succesful response
return await self._try_destination_list(
"timestamp_to_event",
destinations,
# TODO: The requested timestamp may lie in a part of the
# event graph that the remote server *also* didn't have,
# in which case they will have returned another event
# which may be nowhere near the requested timestamp. In
# the future, we may need to reconcile that gap and ask
# other homeservers, and/or extend `/timestamp_to_event`
# to return events on *both* sides of the timestamp to
# help reconcile the gap faster.
_timestamp_to_event_from_destination,
)
try:
# Loop through each homeserver candidate until we get a succesful response
timestamp_to_event_response = await self._try_destination_list(
"timestamp_to_event",
destinations,
# TODO: The requested timestamp may lie in a part of the
# event graph that the remote server *also* didn't have,
# in which case they will have returned another event
# which may be nowhere near the requested timestamp. In
# the future, we may need to reconcile that gap and ask
# other homeservers, and/or extend `/timestamp_to_event`
# to return events on *both* sides of the timestamp to
# help reconcile the gap faster.
_timestamp_to_event_from_destination,
)
return timestamp_to_event_response
except SynapseError:
return None

async def _timestamp_to_event_from_destination(
self, destination: str, room_id: str, timestamp: int, direction: str
Expand Down
146 changes: 70 additions & 76 deletions synapse/handlers/room.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
from synapse.api.errors import (
AuthError,
Codes,
HttpResponseException,
LimitExceededError,
NotFoundError,
StoreError,
Expand All @@ -60,7 +59,6 @@
from synapse.event_auth import validate_event_for_room_version
from synapse.events import EventBase
from synapse.events.utils import copy_and_fixup_power_levels_contents
from synapse.federation.federation_client import InvalidResponseError
from synapse.handlers.relations import BundledAggregations
from synapse.module_api import NOT_SPAM
from synapse.rest.admin._base import assert_user_is_admin
Expand Down Expand Up @@ -1460,6 +1458,8 @@ async def filter_evts(events: List[EventBase]) -> List[EventBase]:

class TimestampLookupHandler:
def __init__(self, hs: "HomeServer"):
# TODO: Remove
self.hs = hs
self.server_name = hs.hostname
self.store = hs.get_datastores().main
self.state_handler = hs.get_state_handler()
Expand Down Expand Up @@ -1493,7 +1493,16 @@ async def get_event_for_timestamp(
Raises:
SynapseError if unable to find any event locally in the given direction
"""
# TODO: Remove: this is just here to consistently reproduce the MSC3030
# Complement test flake
from twisted.internet import task as twisted_task

d = twisted_task.deferLater(self.hs.get_reactor(), 3.5)
await d

logger.debug(
f"get_event_for_timestamp(room_id={room_id}, timestamp={timestamp}, direction={direction}) Finding closest event..."
)
local_event_id = await self.store.get_event_id_for_timestamp(
room_id, timestamp, direction
)
Expand Down Expand Up @@ -1526,6 +1535,8 @@ async def get_event_for_timestamp(
await self.store.is_event_next_to_forward_gap(local_event)
)

likely_domains = None

# If we found a gap, we should probably ask another homeserver first
# about more history in between
if (
Expand All @@ -1545,85 +1556,54 @@ async def get_event_for_timestamp(
)
)

# Loop through each homeserver candidate until we get a succesful response
for domain in likely_domains:
# We don't want to ask our own server for information we don't have
if domain == self.server_name:
continue
remote_response = await self.federation_client.timestamp_to_event(
destinations=likely_domains,
room_id=room_id,
timestamp=timestamp,
direction=direction,
)
if remote_response is not None:
logger.debug(
"get_event_for_timestamp: remote_response=%s",
remote_response,
)

try:
remote_response = await self.federation_client.timestamp_to_event(
domain, room_id, timestamp, direction
)
logger.debug(
"get_event_for_timestamp: response from domain(%s)=%s",
domain,
remote_response,
)
remote_event_id = remote_response.event_id
remote_origin_server_ts = remote_response.origin_server_ts

remote_event_id = remote_response.event_id
remote_origin_server_ts = remote_response.origin_server_ts

# Backfill this event so we can get a pagination token for
# it with `/context` and paginate `/messages` from this
# point.
#
# TODO: The requested timestamp may lie in a part of the
# event graph that the remote server *also* didn't have,
# in which case they will have returned another event
# which may be nowhere near the requested timestamp. In
# the future, we may need to reconcile that gap and ask
# other homeservers, and/or extend `/timestamp_to_event`
# to return events on *both* sides of the timestamp to
# help reconcile the gap faster.
remote_event = (
await self.federation_event_handler.backfill_event_id(
domain, room_id, remote_event_id
)
)
# Backfill this event so we can get a pagination token for
# it with `/context` and paginate `/messages` from this
# point.
pulled_pdu_info = await self.federation_event_handler.backfill_event_id(
likely_domains, room_id, remote_event_id
)
remote_event = pulled_pdu_info.pdu

# XXX: When we see that the remote server is not trustworthy,
# maybe we should not ask them first in the future.
if remote_origin_server_ts != remote_event.origin_server_ts:
logger.info(
"get_event_for_timestamp: Remote server (%s) claimed that remote_event_id=%s occured at remote_origin_server_ts=%s but that isn't true (actually occured at %s). Their claims are dubious and we should consider not trusting them.",
domain,
remote_event_id,
remote_origin_server_ts,
remote_event.origin_server_ts,
)

# Only return the remote event if it's closer than the local event
if not local_event or (
abs(remote_event.origin_server_ts - timestamp)
< abs(local_event.origin_server_ts - timestamp)
):
logger.info(
"get_event_for_timestamp: returning remote_event_id=%s (%s) since it's closer to timestamp=%s than local_event=%s (%s)",
remote_event_id,
remote_event.origin_server_ts,
timestamp,
local_event.event_id if local_event else None,
local_event.origin_server_ts if local_event else None,
)
return remote_event_id, remote_origin_server_ts
except (HttpResponseException, InvalidResponseError) as ex:
# Let's not put a high priority on some other homeserver
# failing to respond or giving a random response
logger.debug(
"get_event_for_timestamp: Failed to fetch /timestamp_to_event from %s because of exception(%s) %s args=%s",
domain,
type(ex).__name__,
ex,
ex.args,
# XXX: When we see that the remote server is not trustworthy,
# maybe we should not ask them first in the future.
if remote_origin_server_ts != remote_event.origin_server_ts:
logger.info(
"get_event_for_timestamp: Remote server (%s) claimed that remote_event_id=%s occured at remote_origin_server_ts=%s but that isn't true (actually occured at %s). Their claims are dubious and we should consider not trusting them.",
pulled_pdu_info.pull_origin,
remote_event_id,
remote_origin_server_ts,
remote_event.origin_server_ts,
)
except Exception:
# But we do want to see some exceptions in our code
logger.warning(
"get_event_for_timestamp: Failed to fetch /timestamp_to_event from %s because of exception",
domain,
exc_info=True,

# Only return the remote event if it's closer than the local event
if not local_event or (
abs(remote_event.origin_server_ts - timestamp)
< abs(local_event.origin_server_ts - timestamp)
):
logger.info(
"get_event_for_timestamp: returning remote_event_id=%s (%s) since it's closer to timestamp=%s than local_event=%s (%s)",
remote_event_id,
remote_event.origin_server_ts,
timestamp,
local_event.event_id if local_event else None,
local_event.origin_server_ts if local_event else None,
)
return remote_event_id, remote_origin_server_ts

# To appease mypy, we have to add both of these conditions to check for
# `None`. We only expect `local_event` to be `None` when
Expand All @@ -1635,6 +1615,20 @@ async def get_event_for_timestamp(
errcode=Codes.NOT_FOUND,
)

if local_event.internal_metadata.outlier:
# We might already have these on hand from asking a remote server before
if likely_domains is None:
likely_domains = await self._storage_controllers.state.get_current_hosts_in_room_ordered(
room_id
)

# Backfill this event so we can get a pagination token for
# it with `/context` and paginate `/messages` from this
# point.
await self.federation_event_handler.backfill_event_id(
likely_domains, room_id, local_event.event_id
)

return local_event_id, local_event.origin_server_ts


Expand Down

0 comments on commit b96628f

Please sign in to comment.