Commit

feat: Remove sampling
lynnagara committed Sep 26, 2019
1 parent 5c629a3 commit 2478180
Showing 4 changed files with 28 additions and 122 deletions.
17 changes: 0 additions & 17 deletions src/sentry/conf/server.py
@@ -935,23 +935,6 @@ def create_partitioned_queues(name):
# 'scheduled-name': 'monitor_guid',
}

# Only store a portion of all messages per unique group.
SENTRY_SAMPLE_DATA = False

# The following values control the sampling rates
SENTRY_SAMPLE_RATES = (
# up until N events, store 1 in M
(50, 1),
(1000, 2),
(10000, 10),
(100000, 50),
(1000000, 300),
(10000000, 2000),
)
SENTRY_MAX_SAMPLE_RATE = 10000
SENTRY_SAMPLE_TIMES = ((3600, 1), (360, 10), (60, 60))
SENTRY_MAX_SAMPLE_TIME = 10000

# Web Service
SENTRY_WEB_HOST = "localhost"
SENTRY_WEB_PORT = 9000
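
For reference, a minimal sketch (not part of this commit) of how the removed tiers turned a group's event count into a 1-in-N storage rate; it mirrors the count_limit helper deleted from event_manager.py below, with hypothetical stand-in names for the deleted settings:

SAMPLE_RATES = ((50, 1), (1000, 2), (10000, 10), (100000, 50), (1000000, 300), (10000000, 2000))
MAX_SAMPLE_RATE = 10000

def sample_rate_for(times_seen):
    # up until N events, store 1 in M; past the last tier, fall back to the cap
    for amount, rate in SAMPLE_RATES:
        if times_seen <= amount:
            return rate
    return MAX_SAMPLE_RATE

assert sample_rate_for(40) == 1           # young group: every event kept
assert sample_rate_for(5000) == 10        # store 1 in 10 once past 1,000 events
assert sample_rate_for(10 ** 8) == 10000  # beyond the last tier the cap applies
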
109 changes: 28 additions & 81 deletions src/sentry/event_manager.py
@@ -15,7 +15,7 @@
from django.utils import timezone
from django.utils.encoding import force_text

from sentry import buffer, eventtypes, eventstream, features, tagstore, tsdb
from sentry import buffer, eventtypes, eventstream, tagstore, tsdb
from sentry.constants import (
DEFAULT_STORE_NORMALIZER_ARGS,
LOG_LEVELS,
@@ -102,22 +102,6 @@ def get_tag(data, key):
return v


def count_limit(count):
# TODO: could we do something like num_to_store = max(math.sqrt(100*count)+59, 200) ?
# ~ 150 * ((log(n) - 1.5) ^ 2 - 0.25)
for amount, sample_rate in settings.SENTRY_SAMPLE_RATES:
if count <= amount:
return sample_rate
return settings.SENTRY_MAX_SAMPLE_RATE


def time_limit(silence): # ~ 3600 per hour
for amount, sample_rate in settings.SENTRY_SAMPLE_TIMES:
if silence >= amount:
return sample_rate
return settings.SENTRY_MAX_SAMPLE_TIME


def validate_and_set_timestamp(data, timestamp):
"""
Helper function for event processors/enhancers to avoid setting broken timestamps.
@@ -156,26 +140,6 @@ def parse_client_as_sdk(value):
return {"name": name, "version": version}


if not settings.SENTRY_SAMPLE_DATA:

def should_sample(current_datetime, last_seen, times_seen):
return False


else:

def should_sample(current_datetime, last_seen, times_seen):
silence = current_datetime - last_seen

if times_seen % count_limit(times_seen) == 0:
return False

if times_seen % time_limit(silence) == 0:
return False

return True


def plugin_is_regression(group, event):
project = event.project
for plugin in plugins.for_project(project):
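
To make the removed decision concrete, a hypothetical worked example (only reachable when SENTRY_SAMPLE_DATA was True; the call site below passed epoch seconds, so silence is seconds since the group was last seen):

silence = 10       # current_datetime - last_seen
times_seen = 1000  # count_limit(1000) == 2 and 1000 % 2 == 0, so should_sample(...) returned False: the event was stored
times_seen = 1001  # count_limit(1001) == 10 and 1001 % 10 != 0; time_limit(10) == 10000 and 1001 % 10000 != 0,
                   # so should_sample(...) returned True: the event body was skipped as a sample
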
@@ -689,7 +653,7 @@ def save(self, project_id, raw=False, assume_normalized=False):
kwargs["first_release"] = release

try:
group, is_new, is_regression, is_sample = self._save_aggregate(
group, is_new, is_regression = self._save_aggregate(
event=event, hashes=hashes, release=release, **kwargs
)
except HashDiscarded:
@@ -708,7 +672,6 @@ def save(self, project_id, raw=False, assume_normalized=False):
group = None
is_new = False
is_regression = False
is_sample = False
event_saved.send_robust(project=project, event_size=event.size, sender=EventManager)

# store a reference to the group id to guarantee validation of isolation
@@ -779,33 +742,32 @@ def save(self, project_id, raw=False, assume_normalized=False):
group=group, environment=environment
)

# save the event unless its been sampled
if not is_sample:
try:
with transaction.atomic(using=router.db_for_write(Event)):
event.save()
except IntegrityError:
logger.info(
"duplicate.found",
exc_info=True,
extra={
"event_uuid": event_id,
"project_id": project.id,
"group_id": group.id if group else None,
"model": Event.__name__,
},
)
return event

tagstore.delay_index_event_tags(
organization_id=project.organization_id,
project_id=project.id,
group_id=group.id if group else None,
environment_id=environment.id,
event_id=event.id,
tags=event.tags,
date_added=event.datetime,
# save the event
try:
with transaction.atomic(using=router.db_for_write(Event)):
event.save()
except IntegrityError:
logger.info(
"duplicate.found",
exc_info=True,
extra={
"event_uuid": event_id,
"project_id": project.id,
"group_id": group.id if group else None,
"model": Event.__name__,
},
)
return event

tagstore.delay_index_event_tags(
organization_id=project.organization_id,
project_id=project.id,
group_id=group.id if group else None,
environment_id=environment.id,
event_id=event.id,
tags=event.tags,
date_added=event.datetime,
)

if event_user:
counters = [
@@ -855,7 +817,6 @@ def save(self, project_id, raw=False, assume_normalized=False):
group=group,
event=event,
is_new=is_new,
is_sample=is_sample,
is_regression=is_regression,
is_new_group_environment=is_new_group_environment,
primary_hash=hashes[0],
@@ -994,28 +955,14 @@ def _save_aggregate(self, event, hashes, release, **kwargs):
if group_is_new and len(new_hashes) == len(all_hashes):
is_new = True

# XXX(dcramer): it's important this gets called **before** the aggregate
# is processed as otherwise values like last_seen will get mutated
can_sample = features.has("projects:sample-events", project=project) and should_sample(
event.data.get("received") or float(event.datetime.strftime("%s")),
group.data.get("last_received") or float(group.last_seen.strftime("%s")),
group.times_seen,
)

if not is_new:
is_regression = self._process_existing_aggregate(
group=group, event=event, data=kwargs, release=release
)
else:
is_regression = False

# Determine if we've sampled enough data to store this event
if is_new or is_regression:
is_sample = False
else:
is_sample = can_sample

return group, is_new, is_regression, is_sample
return group, is_new, is_regression

def _handle_regression(self, group, event, release):
if not group.is_resolved():
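
The visible contract change for anything unpacking _save_aggregate directly (a hypothetical caller shown for illustration, not code from this commit): the method now returns a 3-tuple, so a stale 4-way unpack fails loudly instead of silently carrying a sampling flag.

group, is_new, is_regression = manager._save_aggregate(event=event, hashes=hashes, release=release, **kwargs)
# group, is_new, is_regression, is_sample = manager._save_aggregate(...)  # would now raise ValueError
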
22 changes: 0 additions & 22 deletions tests/sentry/event_manager/test_event_manager.py
@@ -74,26 +74,6 @@ def test_similar_message_prefix_doesnt_group(self):

assert event1.group_id != event2.group_id

@mock.patch("sentry.event_manager.should_sample")
def test_does_not_save_event_when_sampled(self, should_sample):
with self.feature("projects:sample-events"):
should_sample.return_value = True
event_id = "a" * 32

manager = EventManager(make_event(event_id=event_id))
manager.save(1)

# This is a brand new event, so it is actually saved.
assert Event.objects.filter(event_id=event_id).exists()

event_id = "b" * 32

manager = EventManager(make_event(event_id=event_id))
manager.save(1)

# This second is a dupe, so should be sampled
assert not Event.objects.filter(event_id=event_id).exists()

def test_ephemral_interfaces_removed_on_save(self):
manager = EventManager(make_event(platform="python"))
manager.normalize()
@@ -794,7 +774,6 @@ def save_event():
group=event.group,
event=event,
is_new=True,
is_sample=False,
is_regression=False,
is_new_group_environment=True,
primary_hash="acbd18db4cc2f85cedef654fccc4a4d8",
@@ -809,7 +788,6 @@ def save_event():
group=event.group,
event=event,
is_new=False,
is_sample=False,
is_regression=None, # XXX: wut
is_new_group_environment=False,
primary_hash="acbd18db4cc2f85cedef654fccc4a4d8",
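
A sketch of how the new behaviour could be asserted instead (not part of this commit; it reuses EventManager, make_event, and the Event model from the removed test above): both the original event and a duplicate in the same group should now be persisted.

def test_every_event_is_saved(self):
    for event_id in ("a" * 32, "b" * 32):
        manager = EventManager(make_event(event_id=event_id))
        manager.save(1)
        # with sampling gone, even the duplicate of an existing group is written
        assert Event.objects.filter(event_id=event_id).exists()
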
2 changes: 0 additions & 2 deletions tests/snuba/eventstream/test_eventstream.py
@@ -77,7 +77,6 @@ def _get_event_count():
"is_new_group_environment": True,
"is_new": True,
"is_regression": False,
"is_sample": False,
"primary_hash": "acbd18db4cc2f85cedef654fccc4a4d8",
"skip_consume": False,
}
@@ -101,7 +100,6 @@ def test_issueless(self, mock_delay_index_event_tags, mock_eventstream_insert):
"is_new_group_environment": True,
"is_new": True,
"is_regression": False,
"is_sample": False,
"primary_hash": "acbd18db4cc2f85cedef654fccc4a4d8",
"skip_consume": False,
}
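
Put together, the kwargs these tests now expect to reach eventstream.insert look roughly like this (a sketch assembled from the assertions above, not a verbatim call site):

eventstream.insert(
    group=event.group,
    event=event,
    is_new=True,
    is_regression=False,
    is_new_group_environment=True,
    primary_hash="acbd18db4cc2f85cedef654fccc4a4d8",
    skip_consume=False,  # is_sample is no longer part of the payload
)
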
