From bc209e52081bc0faa5fe58d81673fb2ecfd283a0 Mon Sep 17 00:00:00 2001
From: Tony Xiao
Date: Thu, 27 Apr 2023 12:02:45 -0400
Subject: [PATCH] fix(profiling): Do not keep reference to frame to prevent
 memory leak (#2049)

The profiler can capture frames from its own thread. When it does so, it
holds on to a reference to the frame in the previous sample. One of the
frames it holds on to is a frame from the profiler itself, which prevents
the references to the other frames from being freed. A consequence of this
is that the local variables of those frames cannot be freed either. This
change ensures we do not keep a reference to the profiler around, in order
to prevent this issue.
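
To illustrate the mechanism, here is a minimal sketch (not code from this
patch; `leaky_sample` is a hypothetical stand-in for the `last_sample`
bookkeeping removed below):

    import gc
    import sys

    class Big:
        pass

    def work():
        big = Big()  # a local variable we expect to be freed on return
        return sys._getframe()  # simulate the sampler grabbing the frame

    # holding on to the frame, as the old per-sample cache did
    leaky_sample = {"main": work()}
    gc.collect()

    # `big` is still reachable through the saved frame's locals
    assert isinstance(leaky_sample["main"].f_locals["big"], Big)

    # dropping the frame reference frees the locals, which is what keying
    # the new cache on plain (abs_path, lineno) tuples achieves
    leaky_sample.clear()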
---
 mypy.ini                 |   2 +
 sentry_sdk/_lru_cache.py | 156 +++++++++++++++++++++++++++++++++++++++
 sentry_sdk/profiler.py   | 138 ++++++++++++++--------------------
 tests/test_lru_cache.py  |  37 ++++++++++
 tests/test_profiler.py   |  60 ++++++++-------
 5 files changed, 283 insertions(+), 110 deletions(-)
 create mode 100644 sentry_sdk/_lru_cache.py
 create mode 100644 tests/test_lru_cache.py

diff --git a/mypy.ini b/mypy.ini
index b23e18f66a..fef90c867e 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -59,6 +59,8 @@ ignore_missing_imports = True
 [mypy-sentry_sdk._queue]
 ignore_missing_imports = True
 disallow_untyped_defs = False
+[mypy-sentry_sdk._lru_cache]
+disallow_untyped_defs = False
 [mypy-celery.app.trace]
 ignore_missing_imports = True
 [mypy-flask.signals]
diff --git a/sentry_sdk/_lru_cache.py b/sentry_sdk/_lru_cache.py
new file mode 100644
index 0000000000..91cf55d09a
--- /dev/null
+++ b/sentry_sdk/_lru_cache.py
@@ -0,0 +1,156 @@
+"""
+A fork of Python 3.6's stdlib lru_cache (found in Python's 'cpython/Lib/functools.py')
+adapted into a data structure for single-threaded use.
+
+https://github.com/python/cpython/blob/v3.6.12/Lib/functools.py
+
+
+Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Python Software Foundation;
+
+All Rights Reserved
+
+
+PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
+--------------------------------------------
+
+1. This LICENSE AGREEMENT is between the Python Software Foundation
+("PSF"), and the Individual or Organization ("Licensee") accessing and
+otherwise using this software ("Python") in source or binary form and
+its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, PSF hereby
+grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
+analyze, test, perform and/or display publicly, prepare derivative works,
+distribute, and otherwise use Python alone or in any derivative version,
+provided, however, that PSF's License Agreement and PSF's notice of copyright,
+i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Python Software Foundation;
+All Rights Reserved" are retained in Python alone or in any derivative version
+prepared by Licensee.
+
+3. In the event Licensee prepares a derivative work that is based on
+or incorporates Python or any part thereof, and wants to make
+the derivative work available to others as provided herein, then
+Licensee hereby agrees to include in any such work a brief summary of
+the changes made to Python.
+
+4. PSF is making Python available to Licensee on an "AS IS"
+basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
+DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
+INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
+FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
+A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
+OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material
+breach of its terms and conditions.
+
+7. Nothing in this License Agreement shall be deemed to create any
+relationship of agency, partnership, or joint venture between PSF and
+Licensee. This License Agreement does not grant permission to use PSF
+trademarks or trade name in a trademark sense to endorse or promote
+products or services of Licensee, or any third party.
+
+8. By copying, installing or otherwise using Python, Licensee
+agrees to be bound by the terms and conditions of this License
+Agreement.
+
+"""

+SENTINEL = object()
+
+
+# aliases to the entries in a node
+PREV = 0
+NEXT = 1
+KEY = 2
+VALUE = 3
+
+
+class LRUCache(object):
+    def __init__(self, max_size):
+        assert max_size > 0
+
+        self.max_size = max_size
+        self.full = False
+
+        self.cache = {}
+
+        # root of the circularly linked list to keep track of
+        # the least recently used key
+        self.root = []  # type: ignore
+        # the node looks like [PREV, NEXT, KEY, VALUE]
+        self.root[:] = [self.root, self.root, None, None]
+
+        self.hits = self.misses = 0
+
+    def set(self, key, value):
+        link = self.cache.get(key, SENTINEL)
+
+        if link is not SENTINEL:
+            # have to move the node to the front of the linked list
+            link_prev, link_next, _key, _value = link
+
+            # first remove the node from the linked list
+            link_prev[NEXT] = link_next
+            link_next[PREV] = link_prev
+
+            # insert the node between the root and the last
+            last = self.root[PREV]
+            last[NEXT] = self.root[PREV] = link
+            link[PREV] = last
+            link[NEXT] = self.root
+
+            # update the value
+            link[VALUE] = value
+
+        elif self.full:
+            # reuse the root node, so update its key/value
+            old_root = self.root
+            old_root[KEY] = key
+            old_root[VALUE] = value
+
+            self.root = old_root[NEXT]
+            old_key = self.root[KEY]
+
+            self.root[KEY] = self.root[VALUE] = None
+
+            del self.cache[old_key]
+
+            self.cache[key] = old_root
+
+        else:
+            # insert new node after last
+            last = self.root[PREV]
+            link = [last, self.root, key, value]
+            last[NEXT] = self.root[PREV] = self.cache[key] = link
+            self.full = len(self.cache) >= self.max_size
+
+    def get(self, key, default=None):
+        link = self.cache.get(key, SENTINEL)
+
+        if link is SENTINEL:
+            self.misses += 1
+            return default
+
+        # have to move the node to the front of the linked list
+        link_prev, link_next, _key, _value = link
+
+        # first remove the node from the linked list
+        link_prev[NEXT] = link_next
+        link_next[PREV] = link_prev
+
+        # insert the node between the root and the last
+        last = self.root[PREV]
+        last[NEXT] = self.root[PREV] = link
+        link[PREV] = last
+        link[NEXT] = self.root
+
+        self.hits += 1
+
+        return link[VALUE]
diff --git a/sentry_sdk/profiler.py b/sentry_sdk/profiler.py
index 2ce6e01a2f..ee74a86e52 100644
--- a/sentry_sdk/profiler.py
+++ b/sentry_sdk/profiler.py
@@ -37,6 +37,7 @@
 
 import sentry_sdk
 from sentry_sdk._compat import PY33, PY311
+from sentry_sdk._lru_cache import LRUCache
 from sentry_sdk._types import TYPE_CHECKING
 from sentry_sdk.utils import (
     capture_internal_exception,
@@ -65,19 +66,6 @@
 
     ThreadId = str
 
-    # The exact value of this id is not very meaningful. The purpose
-    # of this id is to give us a compact and unique identifier for a
-    # raw stack that can be used as a key to a dictionary so that it
-    # can be used during the sampled format generation.
-    RawStackId = Tuple[int, int]
-
-    RawFrame = Tuple[
-        str,  # abs_path
-        int,  # lineno
-    ]
-    RawStack = Tuple[RawFrame, ...]
-    RawSample = Sequence[Tuple[str, Tuple[RawStackId, RawStack, Deque[FrameType]]]]
-
     ProcessedSample = TypedDict(
         "ProcessedSample",
         {
@@ -120,6 +108,21 @@
         {"profile_id": str},
     )
 
+    FrameId = Tuple[
+        str,  # abs_path
+        int,  # lineno
+    ]
+    FrameIds = Tuple[FrameId, ...]
+
+    # The exact value of this id is not very meaningful. The purpose
+    # of this id is to give us a compact and unique identifier for a
+    # raw stack that can be used as a key to a dictionary so that it
+    # can be used during the sampled format generation.
+    StackId = Tuple[int, int]
+
+    ExtractedStack = Tuple[StackId, FrameIds, List[ProcessedFrame]]
+    ExtractedSample = Sequence[Tuple[ThreadId, ExtractedStack]]
+
 
 try:
     from gevent import get_hub as get_gevent_hub  # type: ignore
@@ -244,12 +247,16 @@ def teardown_profiler():
 MAX_STACK_DEPTH = 128
 
 
+CWD = os.getcwd()
+
+
 def extract_stack(
-    frame,  # type: Optional[FrameType]
-    prev_cache=None,  # type: Optional[Tuple[RawStackId, RawStack, Deque[FrameType]]]
+    raw_frame,  # type: Optional[FrameType]
+    cache,  # type: LRUCache
+    cwd=CWD,  # type: str
     max_stack_depth=MAX_STACK_DEPTH,  # type: int
 ):
-    # type: (...) -> Tuple[RawStackId, RawStack, Deque[FrameType]]
+    # type: (...) -> ExtractedStack
     """
     Extracts the stack starting the specified frame. The extracted stack
     assumes the specified frame is the top of the stack, and works back
@@ -259,31 +266,21 @@ def extract_stack(
     only the first `MAX_STACK_DEPTH` frames will be returned.
     """
 
-    frames = deque(maxlen=max_stack_depth)  # type: Deque[FrameType]
+    raw_frames = deque(maxlen=max_stack_depth)  # type: Deque[FrameType]
 
-    while frame is not None:
-        f_back = frame.f_back
-        frames.append(frame)
-        frame = f_back
+    while raw_frame is not None:
+        f_back = raw_frame.f_back
+        raw_frames.append(raw_frame)
+        raw_frame = f_back
 
-    if prev_cache is None:
-        stack = tuple(frame_key(frame) for frame in frames)
-    else:
-        _, prev_stack, prev_frames = prev_cache
-        prev_depth = len(prev_frames)
-        depth = len(frames)
-
-        # We want to match the frame found in this sample to the frames found in the
-        # previous sample. If they are the same (using the `is` operator), we can
-        # skip the expensive work of extracting the frame information and reuse what
-        # we extracted during the last sample.
-        #
-        # Make sure to keep in mind that the stack is ordered from the inner most
-        # from to the outer most frame so be careful with the indexing.
-        stack = tuple(
-            prev_stack[i] if i >= 0 and frame is prev_frames[i] else frame_key(frame)
-            for i, frame in zip(range(prev_depth - depth, prev_depth), frames)
-        )
+    frame_ids = tuple(frame_id(raw_frame) for raw_frame in raw_frames)
+    frames = []
+    for i, fid in enumerate(frame_ids):
+        frame = cache.get(fid)
+        if frame is None:
+            frame = extract_frame(raw_frames[i], cwd)
+            cache.set(fid, frame)
+        frames.append(frame)
 
     # Instead of mapping the stack into frame ids and hashing
     # that as a tuple, we can directly hash the stack.
@@ -296,14 +293,14 @@ def extract_stack(
     # To Reduce the likelihood of hash collisions, we include
     # the stack depth. This means that only stacks of the same
     # depth can suffer from hash collisions.
-    stack_id = len(stack), hash(stack)
+    stack_id = len(raw_frames), hash(frame_ids)
 
-    return stack_id, stack, frames
+    return stack_id, frame_ids, frames
 
 
-def frame_key(frame):
-    # type: (FrameType) -> RawFrame
-    return (frame.f_code.co_filename, frame.f_lineno)
+def frame_id(raw_frame):
+    # type: (FrameType) -> FrameId
+    return (raw_frame.f_code.co_filename, raw_frame.f_lineno)
 
 
 def extract_frame(frame, cwd):
@@ -472,8 +469,8 @@ def __init__(
         self.stop_ns = 0  # type: int
         self.active = False  # type: bool
 
-        self.indexed_frames = {}  # type: Dict[RawFrame, int]
-        self.indexed_stacks = {}  # type: Dict[RawStackId, int]
+        self.indexed_frames = {}  # type: Dict[FrameId, int]
+        self.indexed_stacks = {}  # type: Dict[StackId, int]
         self.frames = []  # type: List[ProcessedFrame]
         self.stacks = []  # type: List[ProcessedStack]
         self.samples = []  # type: List[ProcessedSample]
@@ -613,8 +610,8 @@ def __exit__(self, ty, value, tb):
 
         scope.profile = old_profile
 
-    def write(self, cwd, ts, sample, frame_cache):
-        # type: (str, int, RawSample, Dict[RawFrame, ProcessedFrame]) -> None
+    def write(self, ts, sample):
+        # type: (int, ExtractedSample) -> None
         if not self.active:
             return
 
@@ -630,23 +627,19 @@ def write(self, cwd, ts, sample, frame_cache):
 
         elapsed_since_start_ns = str(offset)
 
-        for tid, (stack_id, raw_stack, frames) in sample:
+        for tid, (stack_id, frame_ids, frames) in sample:
             try:
                 # Check if the stack is indexed first, this lets us skip
                 # indexing frames if it's not necessary
                 if stack_id not in self.indexed_stacks:
-                    for i, raw_frame in enumerate(raw_stack):
-                        if raw_frame not in self.indexed_frames:
-                            self.indexed_frames[raw_frame] = len(self.indexed_frames)
-                            processed_frame = frame_cache.get(raw_frame)
-                            if processed_frame is None:
-                                processed_frame = extract_frame(frames[i], cwd)
-                                frame_cache[raw_frame] = processed_frame
-                            self.frames.append(processed_frame)
+                    for i, frame_id in enumerate(frame_ids):
+                        if frame_id not in self.indexed_frames:
+                            self.indexed_frames[frame_id] = len(self.indexed_frames)
+                            self.frames.append(frames[i])
 
                     self.indexed_stacks[stack_id] = len(self.indexed_stacks)
                     self.stacks.append(
-                        [self.indexed_frames[raw_frame] for raw_frame in raw_stack]
+                        [self.indexed_frames[frame_id] for frame_id in frame_ids]
                     )
 
                 self.samples.append(
@@ -791,12 +784,7 @@ def make_sampler(self):
         # type: () -> Callable[..., None]
         cwd = os.getcwd()
 
-        # In Python3+, we can use the `nonlocal` keyword to rebind the value,
-        # but this is not possible in Python2. To get around this, we wrap
-        # the value in a list to allow updating this value each sample.
-        last_sample = [
-            {}
-        ]  # type: List[Dict[int, Tuple[RawStackId, RawStack, Deque[FrameType]]]]
+        cache = LRUCache(max_size=256)
 
         def _sample_stack(*args, **kwargs):
             # type: (*Any, **Any) -> None
@@ -808,7 +796,6 @@ def _sample_stack(*args, **kwargs):
             if not self.new_profiles and not self.active_profiles:
                 # make sure to clear the cache if we're not profiling so we dont
                 # keep a reference to the last stack of frames around
-                last_sample[0] = {}
                 return
 
             # This is the number of profiles we want to pop off.
@@ -824,27 +811,16 @@ def _sample_stack(*args, **kwargs):
             now = nanosecond_time()
 
             try:
-                raw_sample = {
-                    tid: extract_stack(frame, last_sample[0].get(tid))
+                sample = [
+                    (str(tid), extract_stack(frame, cache, cwd))
                     for tid, frame in sys._current_frames().items()
-                }
+                ]
             except AttributeError:
                 # For some reason, the frame we get doesn't have certain attributes.
                 # When this happens, we abandon the current sample as it's bad.
                 capture_internal_exception(sys.exc_info())
-
-                # make sure to clear the cache if something went wrong when extracting
-                # the stack so we dont keep a reference to the last stack of frames around
-                last_sample[0] = {}
-
                 return
 
-            # make sure to update the last sample so the cache has
-            # the most recent stack for better cache hits
-            last_sample[0] = raw_sample
-
-            sample = [(str(tid), data) for tid, data in raw_sample.items()]
-
             # Move the new profiles into the active_profiles set.
             #
             # We cannot directly add the to active_profiles set
@@ -860,11 +836,9 @@ def _sample_stack(*args, **kwargs):
 
             inactive_profiles = []
 
-            frame_cache = {}  # type: Dict[RawFrame, ProcessedFrame]
-
             for profile in self.active_profiles:
                 if profile.active:
-                    profile.write(cwd, now, sample, frame_cache)
+                    profile.write(now, sample)
                 else:
                     # If a thread is marked inactive, we buffer it
                     # to `inactive_profiles` so it can be removed
diff --git a/tests/test_lru_cache.py b/tests/test_lru_cache.py
new file mode 100644
index 0000000000..5343e76169
--- /dev/null
+++ b/tests/test_lru_cache.py
@@ -0,0 +1,37 @@
+import pytest
+
+from sentry_sdk._lru_cache import LRUCache
+
+
+@pytest.mark.parametrize("max_size", [-10, -1, 0])
+def test_illegal_size(max_size):
+    with pytest.raises(AssertionError):
+        LRUCache(max_size=max_size)
+
+
+def test_simple_set_get():
+    cache = LRUCache(1)
+    assert cache.get(1) is None
+    cache.set(1, 1)
+    assert cache.get(1) == 1
+
+
+def test_overwrite():
+    cache = LRUCache(1)
+    assert cache.get(1) is None
+    cache.set(1, 1)
+    assert cache.get(1) == 1
+    cache.set(1, 2)
+    assert cache.get(1) == 2
+
+
+def test_cache_eviction():
+    cache = LRUCache(3)
+    cache.set(1, 1)
+    cache.set(2, 2)
+    cache.set(3, 3)
+    assert cache.get(1) == 1
+    assert cache.get(2) == 2
+    cache.set(4, 4)
+    assert cache.get(3) is None
+    assert cache.get(4) == 4
diff --git a/tests/test_profiler.py b/tests/test_profiler.py
index b0e8925be4..11ece9821e 100644
--- a/tests/test_profiler.py
+++ b/tests/test_profiler.py
@@ -20,6 +20,7 @@
     setup_profiler,
 )
 from sentry_sdk.tracing import Transaction
+from sentry_sdk._lru_cache import LRUCache
 from sentry_sdk._queue import Queue
 
 try:
@@ -472,35 +473,40 @@ def test_extract_stack_with_max_depth(depth, max_stack_depth, actual_depth):
 
     # increase the max_depth by the `base_stack_depth` to account
     # for the extra frames pytest will add
-    _, stack, frames = extract_stack(
-        frame, max_stack_depth=max_stack_depth + base_stack_depth
+    _, frame_ids, frames = extract_stack(
+        frame, LRUCache(max_size=1), max_stack_depth=max_stack_depth + base_stack_depth
     )
-    assert len(stack) == base_stack_depth + actual_depth
+    assert len(frame_ids) == base_stack_depth + actual_depth
     assert len(frames) == base_stack_depth + actual_depth
 
     for i in range(actual_depth):
-        assert get_frame_name(frames[i]) == "get_frame", i
+        assert frames[i]["function"] == "get_frame", i
 
     # index 0 contains the inner most frame on the stack, so the lamdba
     # should be at index `actual_depth`
     if sys.version_info >= (3, 11):
         assert (
-            get_frame_name(frames[actual_depth])
+            frames[actual_depth]["function"]
             == "test_extract_stack_with_max_depth.<locals>.<lambda>"
         ), actual_depth
     else:
-        assert get_frame_name(frames[actual_depth]) == "<lambda>", actual_depth
+        assert frames[actual_depth]["function"] == "<lambda>", actual_depth
 
 
-def test_extract_stack_with_cache():
-    frame = get_frame(depth=1)
-
-    prev_cache = extract_stack(frame)
-    _, stack1, _ = prev_cache
-    _, stack2, _ = extract_stack(frame, prev_cache)
-
-    assert len(stack1) == len(stack2)
-    for i, (frame1, frame2) in enumerate(zip(stack1, stack2)):
+@pytest.mark.parametrize(
+    ("frame", "depth"),
+    [(get_frame(depth=1), len(inspect.stack()))],
+)
+def test_extract_stack_with_cache(frame, depth):
+    # make sure cache has enough room or this test will fail
+    cache = LRUCache(max_size=depth)
+    _, _, frames1 = extract_stack(frame, cache)
+    _, _, frames2 = extract_stack(frame, cache)
+
+    assert len(frames1) > 0
+    assert len(frames2) > 0
+    assert len(frames1) == len(frames2)
+    for i, (frame1, frame2) in enumerate(zip(frames1, frames2)):
         # DO NOT use `==` for the assertion here since we are
         # testing for identity, and using `==` would test for
         # equality which would always pass since we're extract
@@ -629,9 +635,7 @@ def test_thread_scheduler_single_background_thread(scheduler_class):
 )
 @mock.patch("sentry_sdk.profiler.MAX_PROFILE_DURATION_NS", 1)
 def test_max_profile_duration_reached(scheduler_class):
-    sample = [("1", extract_stack(get_frame()))]
-
-    cwd = os.getcwd()
+    sample = [("1", extract_stack(get_frame(), LRUCache(max_size=1)))]
 
     with scheduler_class(frequency=1000) as scheduler:
         transaction = Transaction(sampled=True)
@@ -640,15 +644,15 @@ def test_max_profile_duration_reached(scheduler_class):
             assert profile.active
 
             # write a sample at the start time, so still active
-            profile.write(cwd, profile.start_ns + 0, sample, {})
+            profile.write(profile.start_ns + 0, sample)
             assert profile.active
 
             # write a sample at max time, so still active
-            profile.write(cwd, profile.start_ns + 1, sample, {})
+            profile.write(profile.start_ns + 1, sample)
             assert profile.active
 
             # write a sample PAST the max time, so now inactive
-            profile.write(cwd, profile.start_ns + 2, sample, {})
+            profile.write(profile.start_ns + 2, sample)
             assert not profile.active
 
 
@@ -675,8 +679,8 @@ def ensure_running(self):
 
 
 sample_stacks = [
-    extract_stack(get_frame(), max_stack_depth=1),
-    extract_stack(get_frame(), max_stack_depth=2),
+    extract_stack(get_frame(), LRUCache(max_size=1), max_stack_depth=1),
+    extract_stack(get_frame(), LRUCache(max_size=1), max_stack_depth=2),
 ]
 
 
@@ -706,7 +710,7 @@ def ensure_running(self):
     pytest.param(
         [(0, [("1", sample_stacks[0])])],
         {
-            "frames": [extract_frame(sample_stacks[0][2][0], os.getcwd())],
+            "frames": [sample_stacks[0][2][0]],
             "samples": [
                 {
                     "elapsed_since_start_ns": "0",
@@ -725,7 +729,7 @@ def ensure_running(self):
             (1, [("1", sample_stacks[0])]),
         ],
         {
-            "frames": [extract_frame(sample_stacks[0][2][0], os.getcwd())],
+            "frames": [sample_stacks[0][2][0]],
             "samples": [
                 {
                     "elapsed_since_start_ns": "0",
@@ -750,8 +754,8 @@ def ensure_running(self):
         ],
         {
             "frames": [
-                extract_frame(sample_stacks[0][2][0], os.getcwd()),
-                extract_frame(sample_stacks[1][2][0], os.getcwd()),
+                sample_stacks[0][2][0],
+                sample_stacks[1][2][0],
             ],
             "samples": [
                 {
@@ -785,7 +789,7 @@ def test_profile_processing(
 
         # force the sample to be written at a time relative to the
         # start of the profile
         now = profile.start_ns + ts
-        profile.write(os.getcwd(), now, sample, {})
+        profile.write(now, sample)
 
         processed = profile.process()
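
For reference, a minimal sketch of the caching pattern this patch
introduces (illustrative only; `process_frame` is a hypothetical helper
mirroring the loop inside extract_stack, and the single extracted field
stands in for the full output of extract_frame):

    import sys

    from sentry_sdk._lru_cache import LRUCache

    cache = LRUCache(max_size=256)

    def process_frame(raw_frame):
        # the cache key is a tuple of plain data, so it holds no
        # reference to the frame object itself
        fid = (raw_frame.f_code.co_filename, raw_frame.f_lineno)
        frame = cache.get(fid)
        if frame is None:
            # the patch extracts the full frame via
            # extract_frame(raw_frame, cwd); one field keeps this short
            frame = {"function": raw_frame.f_code.co_name}
            cache.set(fid, frame)
        return frame

    print(process_frame(sys._getframe()))  # a miss first, then cache hits

Because the cache stores only these plain tuples and dicts, repeated
samples of an unchanged stack still skip re-extraction, without the
profiler pinning its own frames (and their locals) between ticks.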