-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Track number of hosts affected by the rate limiter #13541
Changes from all commits
325cadc
3267318
5679bb2
149ac1d
8be321f
de1bbb8
75ca101
5c7d804
7dc6869
b9d4be3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Add metrics to track how the rate limiter is affecting requests (sleep/reject). | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,7 +30,7 @@ | |
run_in_background, | ||
) | ||
from synapse.logging.opentracing import start_active_span | ||
from synapse.metrics import Histogram | ||
from synapse.metrics import Histogram, LaterGauge | ||
from synapse.util import Clock | ||
|
||
if typing.TYPE_CHECKING: | ||
|
@@ -74,6 +74,27 @@ def new_limiter() -> "_PerHostRatelimiter": | |
str, "_PerHostRatelimiter" | ||
] = collections.defaultdict(new_limiter) | ||
|
||
# We track the number of affected hosts per time-period so we can | ||
# differentiate one really noisy homeserver from a general | ||
# ratelimit tuning problem across the federation. | ||
MadLittleMods marked this conversation as resolved.
Show resolved
Hide resolved
|
||
LaterGauge( | ||
MadLittleMods marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"synapse_rate_limit_sleep_affected_hosts", | ||
"Number of hosts that had requests put to sleep", | ||
[], | ||
lambda: sum( | ||
ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values() | ||
), | ||
) | ||
LaterGauge( | ||
"synapse_rate_limit_reject_affected_hosts", | ||
"Number of hosts that had requests rejected", | ||
[], | ||
lambda: sum( | ||
ratelimiter.should_reject() | ||
for ratelimiter in self.ratelimiters.values() | ||
), | ||
Comment on lines
+92
to
+95
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems to always evaluate to Even though we see individual requests being rejected: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As a note, when I log There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've tried to visualise this at https://grafana.matrix.org/d/000000012/synapse?orgId=1&viewPanel=225&var-datasource=default&var-bucket_size=$__auto_interval_bucket_size&var-instance=matrix.org&var-job=All&var-index=All but I still see 0 for all There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These numbers look consistent with data I saw when investigating in the manhole ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @DMRobertson Looks like #13649 hasn't fixed it if it has already been rolled out to Here is the rate limiter graph section I have: I'll create an actual issue to track this more long-term now ⏩ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. #13670 ⏩ |
||
) | ||
|
||
def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": | ||
"""Used to ratelimit an incoming request from a given host | ||
|
||
|
@@ -139,6 +160,21 @@ def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": | |
finally: | ||
self._on_exit(request_id) | ||
|
||
def should_reject(self) -> bool:
    """Decide whether an incoming request must be rejected outright.

    A request is rejected when the total number of requests already
    queued up — those currently sleeping plus those waiting in the
    ready queue — exceeds the configured reject limit.
    """
    pending = len(self.ready_request_queue)
    asleep = len(self.sleeping_requests)
    return (pending + asleep) > self.reject_limit
|
||
def should_sleep(self) -> bool:
    """Decide whether an incoming request should be put to sleep.

    We sleep a request when more requests than the configured sleep
    limit have already come through within the current time window.
    """
    recent_request_count = len(self.request_times)
    return recent_request_count > self.sleep_limit
|
||
def _on_enter(self, request_id: object) -> "defer.Deferred[None]": | ||
time_now = self.clock.time_msec() | ||
|
||
|
@@ -149,8 +185,7 @@ def _on_enter(self, request_id: object) -> "defer.Deferred[None]": | |
|
||
# reject the request if we already have too many queued up (either | ||
# sleeping or in the ready queue). | ||
queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) | ||
if queue_size > self.reject_limit: | ||
if self.should_reject(): | ||
logger.debug("Ratelimiter(%s): rejecting request", self.host) | ||
rate_limit_reject_counter.inc() | ||
raise LimitExceededError( | ||
|
@@ -180,7 +215,7 @@ def queue_request() -> "defer.Deferred[None]": | |
len(self.request_times), | ||
) | ||
|
||
if len(self.request_times) > self.sleep_limit: | ||
if self.should_sleep(): | ||
logger.debug( | ||
"Ratelimiter(%s) [%s]: sleeping request for %f sec", | ||
self.host, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same changelog as #13534 so they merge together.