From 1b2e9ce28dad09c0cbab2d94265deda83d25377f Mon Sep 17 00:00:00 2001 From: Tim McCormack Date: Tue, 6 Aug 2024 21:45:08 +0000 Subject: [PATCH] feat: Log all span-starts for the first minute Datadog Support has asked us to log all span-starts. I'm hoping that they can get what they need if we just log the first minute of spans, since the problem seems to be related to some of the first requests after deploy or restart. Excluding function-docstring linting for the span processor, since those are essentially overrides. (pylint only started caring once one of the methods was more than 4 lines long.) --- CHANGELOG.rst | 6 ++++ edx_arch_experiments/__init__.py | 2 +- .../datadog_diagnostics/apps.py | 20 +++++++++++++ .../datadog_diagnostics/tests/test_app.py | 29 +++++++++++++++++-- 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6e5207d..e846f3f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,12 @@ Change Log Unreleased ~~~~~~~~~~ +[4.1.0] - 2024-08-07 +~~~~~~~~~~~~~~~~~~~~ +Changed +------- +* Datadog diagnostics will now log all span-starts for the first minute after server startup + [4.0.0] - 2024-08-05 ~~~~~~~~~~~~~~~~~~~~ Changed diff --git a/edx_arch_experiments/__init__.py b/edx_arch_experiments/__init__.py index a00a1e5..bb3e7ee 100644 --- a/edx_arch_experiments/__init__.py +++ b/edx_arch_experiments/__init__.py @@ -2,4 +2,4 @@ A plugin to include applications under development by the architecture team at 2U. """ -__version__ = '4.0.0' +__version__ = '4.1.0' diff --git a/edx_arch_experiments/datadog_diagnostics/apps.py b/edx_arch_experiments/datadog_diagnostics/apps.py index de29169..3ca9401 100644 --- a/edx_arch_experiments/datadog_diagnostics/apps.py +++ b/edx_arch_experiments/datadog_diagnostics/apps.py @@ -3,6 +3,7 @@ """ import logging +import time from django.apps import AppConfig from django.conf import settings @@ -26,7 +27,18 @@ # avoids logging more data than is actually needed for diagnosis. DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100) +# .. setting_name: DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD +# .. setting_default: 60 +# .. setting_description: Log all span starts for this many seconds after worker +# startup. +DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD = getattr( + settings, + 'DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD', + 60 +) + +# pylint: disable=missing-function-docstring class MissingSpanProcessor: """Datadog span processor that logs unfinished spans at shutdown.""" @@ -34,12 +46,20 @@ def __init__(self): self.spans_started = 0 self.spans_finished = 0 self.open_spans = {} + self.log_all_until = time.time() + DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD def on_span_start(self, span): self.spans_started += 1 if len(self.open_spans) < DATADOG_DIAGNOSTICS_MAX_SPANS: self.open_spans[span.span_id] = span + # We believe that the anomalous traces always come from a + # single span that is created early in the lifetime of a + # gunicorn worker. If we log *every* span-start in this early + # period, we may be able to observe something interesting. + if time.time() <= self.log_all_until: + log.info(f"Early span-start sample: {span._pprint()}") # pylint: disable=protected-access + def on_span_finish(self, span): self.spans_finished += 1 self.open_spans.pop(span.span_id, None) # "delete if present" diff --git a/edx_arch_experiments/datadog_diagnostics/tests/test_app.py b/edx_arch_experiments/datadog_diagnostics/tests/test_app.py index b0ce611..b927499 100644 --- a/edx_arch_experiments/datadog_diagnostics/tests/test_app.py +++ b/edx_arch_experiments/datadog_diagnostics/tests/test_app.py @@ -2,7 +2,7 @@ Tests for plugin app. """ -from unittest.mock import patch +from unittest.mock import call, patch from django.test import TestCase @@ -47,5 +47,30 @@ def test_logging(self, mock_log_error, mock_log_info): proc.on_span_start(FakeSpan(17)) proc.shutdown(0) - mock_log_info.assert_called_once_with("Spans created = 1; spans finished = 0") + assert mock_log_info.call_args_list == [ + call("Early span-start sample: span_id=17"), + call("Spans created = 1; spans finished = 0"), + ] mock_log_error.assert_called_once_with("Span created but not finished: span_id=17") + + @patch('edx_arch_experiments.datadog_diagnostics.apps.log.info') + def test_early_span_logging_cutoff(self, mock_log_info): + with patch('edx_arch_experiments.datadog_diagnostics.apps.time.time', side_effect=[ + # Setup + 1700000000, + # Span-start time checks + 1700000020, + 1700000040, + 1700010000, + ]): + proc = apps.MissingSpanProcessor() + # Three spans are started + proc.on_span_start(FakeSpan(44)) + proc.on_span_start(FakeSpan(45)) + proc.on_span_start(FakeSpan(46)) + + # Just two early span-starts are logged + assert mock_log_info.call_args_list == [ + call("Early span-start sample: span_id=44"), + call("Early span-start sample: span_id=45"), + ]