diff --git a/CHANGELOG.md b/CHANGELOG.md index 97a73fac29..a8b9eb9aac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ - Move `process-event` from the Relay CLI into a standalone tool. ([#740](//github.com/getsentry/relay/pull/740)) - Add the client SDK to session kafka payloads. ([#751](https://github.com/getsentry/relay/pull/751)) - Add a standalone tool to document metrics in JSON or YAML. ([#752](https://github.com/getsentry/relay/pull/752)) +- Emit `processing.event.produced` for user report and session Kafka messages. ([#757](https://github.com/getsentry/relay/pull/757)) ## 20.8.0 diff --git a/relay-server/src/actors/store.rs b/relay-server/src/actors/store.rs index 3113691fad..f507a2b38e 100644 --- a/relay-server/src/actors/store.rs +++ b/relay-server/src/actors/store.rs @@ -441,6 +441,10 @@ impl Handler for StoreForwarder { start_time, item, )?; + metric!( + counter(RelayCounters::ProcessingMessageProduced) += 1, + event_type = "user_report" + ); } ItemType::Session => { self.produce_session( @@ -450,6 +454,10 @@ impl Handler for StoreForwarder { client, item, )?; + metric!( + counter(RelayCounters::ProcessingMessageProduced) += 1, + event_type = "session" + ); } _ => {} } diff --git a/relay-server/src/metrics.rs b/relay-server/src/metrics.rs index ec6e0f2390..de64359588 100644 --- a/relay-server/src/metrics.rs +++ b/relay-server/src/metrics.rs @@ -19,45 +19,91 @@ pub enum RelayHistograms { /// The number of envelopes in the queue as a percentage of the maximum number of envelopes that /// can be stored in the queue. /// - /// The value ranges from `0` (the queue is empty) to `1` (the queue is full and no additional - /// events can be added). + /// The value ranges from `0` when the queue is empty to `1` when the queue is full and no + /// additional events can be added. The queue size can be configured using `event.queue_size`. EnvelopeQueueSizePct, /// The number of envelopes in the queue. 
/// - /// The event queue represents the envelopes that are being processed at a particular time in - /// Relay. Once a request is received, the envelope receives some preliminary (quick) processing - /// to determine if it can be processed or it is rejected. Once this determination has been - /// done, the http request that created the envelope terminates and, if the request is to be - /// further processed, the envelope enters a queue. + /// The queue holds all envelopes that are being processed at a particular time in Relay: /// - /// Once the envelope finishes processing and is sent downstream, the envelope is considered - /// handled and it leaves the queue. + /// - When Relay receives a Request, it ensures the submitted data is wrapped in a single + /// envelope. + /// - The envelope receives some preliminary processing to determine if it can be processed or + /// if it must be rejected. + /// - Once this determination has been made, the HTTP request that created the envelope + /// terminates and, if the request is to be further processed, the envelope enters a queue. + /// - After the envelope finishes processing and is sent upstream, the envelope is considered + /// handled and it leaves the queue. + /// + /// The queue size can be configured with `cache.event_buffer_size`. EnvelopeQueueSize, - /// The size of the request body as seen by Relay after it is extracted from a request. + /// The size of the HTTP request body as seen by Relay after it is extracted from a request in + /// bytes. /// - /// For envelope requests, this is the full size of the envelope. For JSON store requests, this - /// is the size of the JSON body. + /// - For envelope requests, this is the full size of the envelope. + /// - For JSON store requests, this is the size of the JSON body. + /// - For multipart uploads of crash reports and attachments, this is the size of the multipart + /// body including boundaries. 
/// /// If this request contains a base64 zlib compressed payload without a proper /// `content-encoding` header, then this is the size before decompression. + /// + /// The maximum request body size can be configured with `limits.max_envelope_size`. RequestSizeBytesRaw, - /// The size of the request body as seen by Relay after it has been decompressed and decoded in - /// case this request contains a base64 zlib compressed payload without a proper - /// `content-encoding` header. Otherwise, this metric is always equal to `event.size_bytes.raw`. + /// The size of the request body as seen by Relay after decompression and decoding in bytes. + /// + /// JSON store requests may contain a base64 zlib compressed payload without proper + /// `content-encoding` header. In this case, this metric contains the size after decoding. + /// Otherwise, it is always equal to `event.size_bytes.raw`. RequestSizeBytesUncompressed, /// Number of projects in the in-memory project cache that are waiting for their state to be /// updated. + /// + /// See `project_cache.size` for more description of the project cache. ProjectStatePending, - /// Number of project states requested from the Upstream for the current batch request. + /// Number of project states **requested** from the upstream for each batch request. + /// + /// If multiple batches are updated concurrently, this metric is reported multiple times. + /// + /// The batch size can be configured with `cache.batch_size`. See `project_cache.size` for more + /// description of the project cache. ProjectStateRequestBatchSize, - /// Number of project states received from the Upstream for the current batch request. + /// Number of project states **returned** from the upstream for each batch request. + /// + /// If multiple batches are updated concurrently, this metric is reported multiple times. + /// + /// See `project_cache.size` for more description of the project cache. 
ProjectStateReceived, /// Number of project states currently held in the in-memory project cache. + /// + /// The cache duration for project states can be configured with the following options: + /// + /// - `cache.project_expiry`: The time after which a project state counts as expired. It is + /// automatically refreshed if a request references the project after it has expired. + /// - `cache.project_grace_period`: The time after expiry at which the project state will still + /// be used to ingest events. Once the grace period expires, the cache is evicted and new + /// requests wait for an update. + /// + /// There is no limit to the number of cached projects. ProjectStateCacheSize, /// The number of upstream requests queued up for a connection in the connection pool. + /// + /// Relay uses explicit queueing for most requests. This wait queue should almost always be + /// empty, and a large number of queued requests indicates a severe bug. ConnectorWaitQueue, - /// Number of messages queued by the Upstream actor and waiting to be sent over http. - /// This metric is tagged with a priority label (for high and low priority queues). + /// The number of upstream requests queued up for sending. + /// + /// Relay employs connection keep-alive whenever possible. Connections are kept open for _15_ + /// seconds of inactivity or _75_ seconds of activity. If all connections are busy, requests + /// are queued, which is reflected in this metric. + /// + /// This metric is tagged with: + /// - `priority`: The queueing priority of the request, either `"high"` or `"low"`. The + /// priority determines precedence in executing requests. 
+ /// + /// The number of concurrent connections can be configured with: + /// - `limits.max_concurrent_requests` for the overall number of connections + /// - `limits.max_concurrent_queries` for the number of concurrent high-priority requests UpstreamMessageQueueSize, } @@ -80,24 +126,25 @@ impl HistogramMetric for RelayHistograms { /// Timer metrics used by Relay pub enum RelayTimers { - /// The time spent deserializing an event from a JSON byte array into the native data structure - /// on which Relay operates. + /// Time in milliseconds spent deserializing an event from JSON bytes into the native data + /// structure on which Relay operates. EventProcessingDeserialize, - /// Time spent running event processors on an event. Event processing happens before filtering. + /// Time in milliseconds spent running event processors on an event for normalization. Event + /// processing happens before filtering. #[cfg(feature = "processing")] EventProcessingProcess, - /// Time spent running filtering on an event. + /// Time in milliseconds spent running inbound data filters on an event. #[cfg(feature = "processing")] EventProcessingFiltering, - /// Time spent checking for rate limits in Redis. + /// Time in milliseconds spent checking for organization, project, and DSN rate limits. /// - /// Note that not all events are checked against Redis. After an event is rate limited for the - /// first time, the rate limit is cached. Events coming in during this period will be discarded - /// earlier in the request queue and do not reach the processing queue. + /// Not all events reach this point. After an event is rate limited for the first time, the rate + /// limit is cached. Events coming in after this will be discarded earlier in the request queue + /// and do not reach the processing queue. #[cfg(feature = "processing")] EventProcessingRateLimiting, - /// Time spent in data scrubbing for the current event. 
Data scrubbing happens last before - /// serializing the event back to JSON. + /// Time in milliseconds spent in data scrubbing for the current event. Data scrubbing happens + /// last before serializing the event back to JSON. EventProcessingPii, /// Time spent converting the event from its in-memory reprsentation into a JSON string. EventProcessingSerialization, @@ -105,7 +152,7 @@ pub enum RelayTimers { /// the start of synchronous processing in the EventProcessor. This metric primarily indicates /// backlog in event processing. EnvelopeWaitTime, - /// The time spent in synchronous processing of envelopes. + /// Time in milliseconds spent in synchronous processing of envelopes. /// /// This timing covers the end-to-end processing in the CPU pool and comprises: /// @@ -113,36 +160,46 @@ pub enum RelayTimers { /// - `event_processing.pii` /// - `event_processing.serialization` /// - /// With Relay in processing mode, this includes the following additional timings: + /// With Relay in processing mode, this also includes the following timings: /// /// - `event_processing.process` /// - `event_processing.filtering` /// - `event_processing.rate_limiting` EnvelopeProcessingTime, - /// The total time an envelope spends in Relay from the time it is received until it finishes - /// processing and has been submitted. + /// Total time in milliseconds an envelope spends in Relay from the time it is received until it + /// finishes processing and has been submitted to the upstream. EnvelopeTotalTime, - /// The total time spent during `ProjectCache.fetch_states` in which eviction of outdated - /// projects happens. + /// Total time in milliseconds spent evicting outdated and unused projects. ProjectStateEvictionDuration, - /// The total time spent during `ProjectCache.fetch_states` spent waiting for all ProjectState - /// requests to resolve. 
During a fetch_states request, we pick up to max_num_requests * - /// max_num_project_states_per_request projects that need their state updated and batch - /// them into max_num_requests requests. This metric represents the time spent from issuing - /// the first request until all requests are finished. + /// Total time in milliseconds spent waiting for queued project configuration update requests + /// to resolve. + /// + /// Relay updates projects in batches. Every update cycle, Relay requests + /// `limits.max_concurrent_queries * cache.batch_size` projects from the upstream. This metric + /// measures the wall clock time for all concurrent requests in this loop. + /// + /// Note that after an update loop has completed, there may be more projects pending updates. + /// This is indicated by `project_state.pending`. ProjectStateRequestDuration, - /// The total time spent getting the project id from upstream. - /// **Note** that ProjectIdRequests happen only for the legacy - /// endpoint that does not specify the project id in the url, for the new endpoints the - /// project id is extracted from the url path. Only projects with the id not already fetched - /// are counted. - /// The project id is only fetched once and it is not refreshed. + /// Total time in milliseconds spent fetching a project ID from upstream. + /// + /// Relay resolves the Sentry project ID for requests sent to the legacy `/api/store/` endpoint. + /// This process blocks the downstream request handler. After this, the project ID is cached + /// until Relay restarts. + /// + /// Note that official Sentry SDKs do not send data to this endpoint. ProjectIdRequestDuration, - /// The total duration of a request as seen from Relay from the moment the request is - /// received until a http result is returned. Note that this does **not** represent the - /// total duration for processing an event. 
Requests for events that are not immediately - /// rejected ( because the project has hit a rate limit) are scheduled for processing at - /// a latter time and an HTTP OK (200) is returned. + /// Total duration in milliseconds for handling inbound web requests until the HTTP response is + /// returned to the client. + /// + /// This does **not** correspond to the full event ingestion time. Requests for events that are + /// not immediately rejected due to bad data or cached rate limits always return `200 OK`. Full + /// validation and normalization occur asynchronously, which is reported by + /// `event.processing_time`. + /// + /// This metric is tagged with: + /// - `method`: The HTTP method of the request. + /// - `route`: Unique dashed identifier of the endpoint. RequestsDuration, } @@ -171,92 +228,147 @@ impl TimerMetric for RelayTimers { /// Counter metrics used by Relay pub enum RelayCounters { - /// Number of envelopes accepted in the current time slot. This represents requests that have - /// successfully passed rate limits, filters and have been successfully handled. + /// Number of envelopes accepted in the current time slot. + /// + /// This represents requests that have successfully passed rate limits and filters, and have + /// been sent to the upstream. EnvelopeAccepted, - /// Number of envelopes rejected in the current time slot. This includes envelopes being - /// rejected because they are malformed or any other errors during processing (including - /// filtered events, invalid payloads and rate limits). + /// Number of envelopes rejected in the current time slot. + /// + /// This includes envelopes being rejected because they are malformed or any other errors during + /// processing (including filtered events, invalid payloads, and rate limits). + /// + /// To find the rejection reason, check `events.outcomes` instead. 
EnvelopeRejected, - /// Represents a group of counters incremented for every outcome emitted by Relay, implemented - /// with tags. The following tags are present for each event outcome: + /// Number of outcomes and reasons for rejected Envelopes. + /// + /// This metric is tagged with: + /// - `outcome`: The basic cause for rejecting the event. + /// - `reason`: A more detailed identifier describing the rule or mechanism leading to the + /// outcome. /// - /// - `outcome` which is an `Outcome` enumeration - /// - `reason` which is the reason string for all outcomes that are not `Accepted`. + /// Possible outcomes are: + /// - `filtered`: Dropped by inbound data filters. The reason specifies the filter that + /// matched. + /// - `rate_limited`: Dropped by organization, project, or DSN rate limit, as well as exceeding + /// the Sentry plan quota. The reason contains the rate limit or quota that was exceeded. + /// - `invalid`: Data was considered invalid and could not be recovered. The reason indicates + /// the validation that failed. #[cfg(feature = "processing")] Outcomes, - /// Counts the number of times a project state lookup is done. This includes requests - /// for projects that are cached and requests for projects that are not yet cached. - /// All requests that return a `EventAction::Accept` i.e. are not rate limited (on - /// the fast path) or are discarded because we know the project is disabled or invalid - /// will be counted. + /// Number of times a project state is looked up from the cache. + /// + /// This includes lookups for both cached and new projects. As part of this, updates for + /// outdated or expired project caches are triggered. + /// + /// Related metrics: + /// - `project_cache.hit`: For successful cache lookups, even for outdated projects. + /// - `project_cache.miss`: For failed lookups resulting in an update. ProjectStateGet, - /// Counts the number of project state http requests. 
Note that a project state HTTP request - /// typically contains a number of projects (the project state requests are batched). + /// Number of project state HTTP requests. + /// + /// Relay updates projects in batches. Every update cycle, Relay requests + /// `limits.max_concurrent_queries` batches of `cache.batch_size` projects from the upstream. + /// The duration of these requests is reported via `project_state.request.duration`. + /// + /// Note that after an update loop has completed, there may be more projects pending updates. + /// This is indicated by `project_state.pending`. ProjectStateRequest, - /// Counts the number of times a request for a project is already present, this effectively - /// represents the fraction of `project_state.get` that will **not** result in a ProjectState - /// request. + /// Number of times a project is looked up from the cache. + /// + /// The cache may contain an outdated or expired project state. In that case, the project state + /// is updated even after a cache hit. ProjectCacheHit, - /// Counts the number of times a request for a project is not already present. - /// `project_state.get` = `project_cache.miss` + `project_cache.hit`. - /// Requests that are generating a cache hit will be queued and batched and eventually will - /// generate a `project_state.request`. + /// Number of times a project lookup failed. + /// + /// A cache entry is created immediately and the project state requested from the upstream. ProjectCacheMiss, - /// Counts the number of requests for the ProjectId (the timing is tracked - /// by `project_id.request.duration`). Note that ProjectIdRequests happen only for the legacy - /// endpoint that does not specify the project id in the url, for the new endpoints the - /// project id is extracted from the url path. Only projects with the id not already fetched - /// are counted. Once the ProjectId is successfully cached it will be retained indefinitely. + /// Number of project ID HTTP requests. 
+ /// + /// Relay resolves the Sentry project ID for requests sent to the legacy `/api/store/` endpoint. + /// This process blocks the downstream request handler. The timing for this request is reported + /// via `project_id.request.duration`. After this, the project ID is cached until Relay + /// restarts. + /// + /// Note that official Sentry SDKs do not send data to this endpoint. ProjectIdRequest, - /// Counts the number of times Relay started. + /// Number of Relay server starts. + /// /// This can be used to track unwanted restarts due to crashes or termination. ServerStarting, - /// Counts the number of messages placed on the Kafka queue. + /// Number of messages placed on the Kafka queues. + /// + /// When Relay operates as a Sentry service and an Envelope item is successfully processed, + /// each Envelope item results in a dedicated message on one of the ingestion topics on Kafka. + /// + /// This metric is tagged with: + /// - `event_type`: The kind of message produced to Kafka. /// - /// When Relay operates with processing enabled and an item is successfully processed, each item - /// will generate a message on the Kafka. The counter has an `event_type` tag which is set to - /// either `event` or `attachment` representing the type of message produced on the Kafka queue. + /// The message types can be: + /// + /// - `event`: An error or transaction event. Error events are sent to `ingest-events`, + /// transactions to `ingest-transactions`, and errors with attachments are sent to + /// `ingest-attachments`. + /// - `attachment`: An attachment file associated with an error event, sent to + /// `ingest-attachments`. + /// - `user_report`: A message from the user feedback dialog, sent to `ingest-events`. + /// - `session`: A release health session update, sent to `ingest-sessions`. #[cfg(feature = "processing")] ProcessingMessageProduced, - /// Counts the number of producer errors occurred after an event was already enqueued for - /// sending to Kafka. 
These errors might include e.g. MessageTooLarge errors when the broker - /// does not accept the requests over a certain size, which is usually due to invalic or - /// inconsistent broker/producer configurations. + /// Number of producer errors that occurred after an envelope was already enqueued for sending + /// to Kafka. + /// + /// These errors include, for example, _"MessageTooLarge"_ errors when the broker does not + /// accept the requests over a certain size, which is usually due to invalid or inconsistent + /// broker/producer configurations. #[cfg(feature = "processing")] ProcessingProduceError, - /// Counts the number of events that hit any of the Store like endpoints (Store, Security, - /// MiniDump, Unreal). The events are counted before they are rate limited , filtered or - /// processed in any way. The counter has a `version` tag that tracks the message event - /// protocol version. + /// Number of events that hit any of the store-like endpoints: Envelope, Store, Security, + /// Minidump, Unreal. + /// + /// The events are counted before they are rate limited, filtered, or processed in any way. + /// + /// This metric is tagged with: + /// - `version`: The event protocol version number defaulting to `7`. EventProtocol, - /// Counts the number of requests reaching Relay. + /// Number of HTTP requests reaching Relay. Requests, - /// Counts the number of requests that have finished during the current interval. - /// The counter has the following tags: + /// Number of completed HTTP requests. + /// + /// This metric is tagged with: /// - /// - `status_code` The HTTP status code number. - /// - `method` The HTTP method used in the request in uppercase. - /// - `route` Unique dashed identifier of the endpoint. 
ResponsesStatusCodes, - /// We are scanning our in-memory project cache for stale entries. This counter is incremented - /// before doing the expensive operation. + /// Number of evicted stale projects from the cache. + /// + /// Relay scans the in-memory project cache for stale entries in a regular interval configured + /// by `cache.eviction_interval`. + /// + /// The cache duration for project states can be configured with the following options: + /// + /// - `cache.project_expiry`: The time after which a project state counts as expired. It is + /// automatically refreshed if a request references the project after it has expired. + /// - `cache.project_grace_period`: The time after expiry at which the project state will still + /// be used to ingest events. Once the grace period expires, the cache is evicted and new + /// requests wait for an update. EvictingStaleProjectCaches, - /// The number of requests that reused an already open upstream connection. + /// Number of requests that reused an already open upstream connection. /// - /// Relay employs connection keep-alive whenever possible. Connections are kept open for 15 - /// seconds of inactivity, or 75 seconds of activity. + /// Relay employs connection keep-alive whenever possible. Connections are kept open for _15_ + /// seconds of inactivity or _75_ seconds of activity. ConnectorReused, - /// The number of upstream connections opened. + /// Number of upstream connections opened. ConnectorOpened, - /// The number of upstream connections closed due to connection timeouts. + /// Number of upstream connections closed due to connection timeouts. /// - /// Relay employs connection keep-alive whenever possible. Connections are kept open for 15 - /// seconds of inactivity, or 75 seconds of activity. + /// Relay employs connection keep-alive whenever possible. Connections are kept open for _15_ + /// seconds of inactivity or _75_ seconds of activity. 
ConnectorClosed, - /// The number of upstream connections that experienced errors. + /// Number of upstream connections that experienced errors. ConnectorErrors, - /// The number of upstream connections that experienced a timeout. + /// Number of upstream connections that experienced a timeout. ConnectorTimeouts, }