diff --git a/build.gradle b/build.gradle index f66f3dc852..5e75849b46 100644 --- a/build.gradle +++ b/build.gradle @@ -54,6 +54,7 @@ def hadoopVersion = '2.10.2' def apacheSparkVersion = '3.3.3' def antlrVersion = '4.8' def scala = '2.12' +def openTelemetryVersion = '1.43.0' ext.libraries = [ alpnAgent: "org.mortbay.jetty.alpn:jetty-alpn-agent:${alpnAgentVersion}", @@ -141,6 +142,11 @@ ext.libraries = [ zkclient: 'com.101tec:zkclient:0.7', // For Kafka AdminUtils zookeeper: 'org.apache.zookeeper:zookeeper:3.6.3', zstd: 'com.github.luben:zstd-jni:1.5.2-3', + opentelemetryApi: "io.opentelemetry:opentelemetry-api:${openTelemetryVersion}", + opentelemetrySdk: "io.opentelemetry:opentelemetry-sdk:${openTelemetryVersion}", + opentelemetryExporterLogging: "io.opentelemetry:opentelemetry-exporter-logging:${openTelemetryVersion}", + opentelemetryExporterOtlp: "io.opentelemetry:opentelemetry-exporter-otlp:${openTelemetryVersion}", + opentelemetryExporterCommon: "io.opentelemetry:opentelemetry-exporter-common:${openTelemetryVersion}" ] group = 'com.linkedin.venice' diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggHostLevelIngestionStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggHostLevelIngestionStats.java index 463201f2b5..3752e8c8e8 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggHostLevelIngestionStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggHostLevelIngestionStats.java @@ -23,10 +23,12 @@ public AggHostLevelIngestionStats( boolean unregisterMetricForDeletedStoreEnabled, Time time) { super( + serverConfig.getClusterName(), metricsRepository, new HostLevelStoreIngestionStatsSupplier(serverConfig, ingestionTaskMap, time), metadataRepository, - unregisterMetricForDeletedStoreEnabled); + unregisterMetricForDeletedStoreEnabled, + false); } static class HostLevelStoreIngestionStatsSupplier implements StatsSupplier { @@ -44,7 +46,7 @@ static 
class HostLevelStoreIngestionStatsSupplier implements StatsSupplier { - public StorageEngineStatsReporter(MetricsRepository metricsRepository, String storeName) { + public StorageEngineStatsReporter(MetricsRepository metricsRepository, String storeName, String clusterName) { super(metricsRepository, storeName); } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/DIVStatsReporter.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/DIVStatsReporter.java index 4432917658..d48b867588 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/DIVStatsReporter.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/DIVStatsReporter.java @@ -14,7 +14,7 @@ * collection/visualization system. */ public class DIVStatsReporter extends AbstractVeniceStatsReporter { - public DIVStatsReporter(MetricsRepository metricsRepository, String storeName) { + public DIVStatsReporter(MetricsRepository metricsRepository, String storeName, String clusterName) { super(metricsRepository, storeName); } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/IngestionStatsReporter.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/IngestionStatsReporter.java index e88f39e5c9..7993eaa171 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/IngestionStatsReporter.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/IngestionStatsReporter.java @@ -60,7 +60,7 @@ public class IngestionStatsReporter extends AbstractVeniceStatsReporter { private static final Logger LOGGER = LogManager.getLogger(IngestionStatsReporter.class); - public IngestionStatsReporter(MetricsRepository metricsRepository, String storeName) { + public IngestionStatsReporter(MetricsRepository metricsRepository, String storeName, String clusterName) { super(metricsRepository, storeName); } diff --git 
a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/VeniceVersionedStatsReporter.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/VeniceVersionedStatsReporter.java index dd112be658..232b1c5860 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/VeniceVersionedStatsReporter.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/VeniceVersionedStatsReporter.java @@ -30,10 +30,10 @@ public VeniceVersionedStatsReporter( registerSensor("current_version", new AsyncGauge((ignored1, ignored2) -> currentVersion, "current_version")); registerSensor("future_version", new AsyncGauge((ignored1, ignored2) -> futureVersion, "future_version")); - this.currentStatsReporter = statsSupplier.get(metricsRepository, storeName + "_current"); + this.currentStatsReporter = statsSupplier.get(metricsRepository, storeName + "_current", (String) null); if (!isSystemStore) { - this.futureStatsReporter = statsSupplier.get(metricsRepository, storeName + "_future"); - this.totalStatsReporter = statsSupplier.get(metricsRepository, storeName + "_total"); + this.futureStatsReporter = statsSupplier.get(metricsRepository, storeName + "_future", (String) null); + this.totalStatsReporter = statsSupplier.get(metricsRepository, storeName + "_total", (String) null); } else { this.futureStatsReporter = null; this.totalStatsReporter = null; diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java index 7fd9f177e5..5ce963a58f 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java @@ -68,7 +68,10 @@ public HeartbeatMonitoringService( 
metricsRepository, metadataRepository, () -> new HeartbeatStat(new MetricConfig(), regionNames), - (aMetricsRepository, storeName) -> new HeartbeatStatReporter(aMetricsRepository, storeName, regionNames), + (aMetricsRepository, storeName, clusterName) -> new HeartbeatStatReporter( + aMetricsRepository, + storeName, + regionNames), leaderHeartbeatTimeStamps, followerHeartbeatTimeStamps); } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/DIVStatsReporterTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/DIVStatsReporterTest.java index 1fd1632b5d..a5279bc6a9 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/DIVStatsReporterTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/DIVStatsReporterTest.java @@ -19,7 +19,7 @@ public void testDIVReporterCanReport() { metricsRepository.addReporter(reporter); String storeName = Utils.getUniqueString("store"); - DIVStatsReporter divStatsReporter = new DIVStatsReporter(metricsRepository, storeName); + DIVStatsReporter divStatsReporter = new DIVStatsReporter(metricsRepository, storeName, null); assertEquals(reporter.query("." 
+ storeName + "--success_msg.DIVStatsGauge").value(), (double) NULL_DIV_STATS.code); diff --git a/internal/venice-client-common/build.gradle b/internal/venice-client-common/build.gradle index 3ac9a15dc7..9d62bb9260 100644 --- a/internal/venice-client-common/build.gradle +++ b/internal/venice-client-common/build.gradle @@ -39,6 +39,11 @@ dependencies { implementation libraries.log4j2api implementation libraries.zstd implementation libraries.conscrypt + implementation libraries.opentelemetryApi + implementation libraries.opentelemetrySdk + implementation libraries.opentelemetryExporterLogging + implementation libraries.opentelemetryExporterOtlp + implementation libraries.opentelemetryExporterCommon testImplementation project(':internal:venice-test-common') testImplementation project(':clients:venice-thin-client') diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStats.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStats.java index 03a87a4bfe..82fb1905c3 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStats.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStats.java @@ -11,39 +11,56 @@ public abstract class AbstractVeniceAggStats { protected final Map storeStats = new VeniceConcurrentHashMap<>(); private StatsSupplier statsFactory; + private final MetricsRepository metricsRepository; + private final String clusterName; - private AbstractVeniceAggStats(MetricsRepository metricsRepository, StatsSupplier statsSupplier, T totalStats) { + private AbstractVeniceAggStats( + String clusterName, + MetricsRepository metricsRepository, + StatsSupplier statsSupplier, + T totalStats) { + this.clusterName = clusterName; this.metricsRepository = metricsRepository; this.statsFactory = statsSupplier; this.totalStats = totalStats; } - public AbstractVeniceAggStats(MetricsRepository 
metricsRepository, StatsSupplier statsSupplier) { - this(metricsRepository, statsSupplier, statsSupplier.get(metricsRepository, STORE_NAME_FOR_TOTAL_STAT, null)); - } - - public AbstractVeniceAggStats(MetricsRepository metricsRepository) { + public AbstractVeniceAggStats(String clusterName, MetricsRepository metricsRepository) { this.metricsRepository = metricsRepository; + this.clusterName = clusterName; } public void setStatsSupplier(StatsSupplier statsSupplier) { this.statsFactory = statsSupplier; - this.totalStats = statsSupplier.get(metricsRepository, STORE_NAME_FOR_TOTAL_STAT, null); + this.totalStats = statsSupplier.get(metricsRepository, STORE_NAME_FOR_TOTAL_STAT, clusterName, null); } + /** + * clusterName is used to create per cluster aggregate stats and {@link com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions}
+ * If perClusterAggregate is true, it will create per cluster aggregates with storeName as "total." + */ public AbstractVeniceAggStats( String clusterName, MetricsRepository metricsRepository, - StatsSupplier statsSupplier) { - this( + StatsSupplier statsSupplier, + boolean perClusterAggregate) { + if (perClusterAggregate && clusterName == null) { + throw new IllegalArgumentException("perClusterAggregate cannot be true when clusterName is null"); + } + this.clusterName = clusterName; + this.metricsRepository = metricsRepository; + this.statsFactory = statsSupplier; + this.totalStats = statsSupplier.get( metricsRepository, - statsSupplier, - statsSupplier.get(metricsRepository, STORE_NAME_FOR_TOTAL_STAT + "." + clusterName, null)); + perClusterAggregate ? STORE_NAME_FOR_TOTAL_STAT + "." + clusterName : STORE_NAME_FOR_TOTAL_STAT, + clusterName, + null); } public T getStoreStats(String storeName) { - return storeStats.computeIfAbsent(storeName, k -> statsFactory.get(metricsRepository, storeName, totalStats)); + return storeStats + .computeIfAbsent(storeName, k -> statsFactory.get(metricsRepository, storeName, clusterName, totalStats)); } public T getNullableStoreStats(String storeName) { diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/StatsSupplier.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/StatsSupplier.java index 9967d93c3f..f42c97b03e 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/StatsSupplier.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/StatsSupplier.java @@ -7,15 +7,15 @@ public interface StatsSupplier { /** * Legacy function, for implementations that do not use total stats in their constructor. * - * @see #get(MetricsRepository, String, AbstractVeniceStats) which is the only caller. + * @see #get(MetricsRepository, String, String, AbstractVeniceStats) which is the only caller. 
*/ - T get(MetricsRepository metricsRepository, String storeName); + T get(MetricsRepository metricsRepository, String storeName, String clusterName); /** * This is the function that gets called by {@link AbstractVeniceAggStats}, and concrete classes can * optionally implement it in order to be provided with the total stats instance. */ - default T get(MetricsRepository metricsRepository, String storeName, T totalStats) { - return get(metricsRepository, storeName); + default T get(MetricsRepository metricsRepository, String storeName, String clusterName, T totalStats) { + return get(metricsRepository, storeName, clusterName); } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsConfig.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsConfig.java new file mode 100644 index 0000000000..0a2a1da619 --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsConfig.java @@ -0,0 +1,477 @@ +package com.linkedin.venice.stats; + +import static com.linkedin.venice.stats.VeniceOpenTelemetryMetricNamingFormat.SNAKE_CASE; + +import com.linkedin.venice.stats.metrics.MetricEntity; +import io.opentelemetry.exporter.otlp.internal.OtlpConfigUtil; +import io.opentelemetry.sdk.metrics.export.AggregationTemporalitySelector; +import io.opentelemetry.sdk.metrics.export.MetricExporter; +import io.tehuti.metrics.MetricConfig; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + + +/** + * Configuration for metrics emitted by Venice: Holds OpenTelemetry as well as Tehuti configs
+ * + * Configs starting with "otel.venice." are Venice-specific configs for OpenTelemetry metrics;
+ * other configs starting with "otel.exporter." are generic OpenTelemetry exporter configs but + * are parsed in this class and used setters to configure otel exporter. + */ +public class VeniceMetricsConfig { + private static final Logger LOGGER = LogManager.getLogger(VeniceMetricsConfig.class); + + /** + * Config to enable OpenTelemetry metrics + */ + public static final String OTEL_VENICE_METRICS_ENABLED = "otel.venice.metrics.enabled"; + + /** + * Config to set the metric prefix for OpenTelemetry metrics + */ + public static final String OTEL_VENICE_METRICS_PREFIX = "otel.venice.metrics.prefix"; + + /** + * Config to set the naming format for OpenTelemetry metrics + * {@link VeniceOpenTelemetryMetricNamingFormat} + */ + public static final String OTEL_VENICE_METRICS_NAMING_FORMAT = "otel.venice.metrics.naming.format"; + + /** + * Export opentelemetry metrics to a log exporter + * {@link VeniceOpenTelemetryMetricsRepository.LogBasedMetricExporter} + */ + public static final String OTEL_VENICE_METRICS_EXPORT_TO_LOG = "otel.venice.metrics.export.to.log"; + + /** + * Export opentelemetry metrics to {@link #OTEL_EXPORTER_OTLP_METRICS_ENDPOINT} + * over {@link #OTEL_EXPORTER_OTLP_METRICS_PROTOCOL} + */ + public static final String OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT = "otel.venice.metrics.export.to.endpoint"; + + /** + * Config Map to add custom dimensions to the metrics: Can be used for system dimensions + * amongst other custom dimensions
+ * These will be emitted along with all the metrics emitted. + * + * + * Custom dimensions are passed as key=value pairs, with each key separated from its value by '=' + * Multiple dimensions are separated by ',' + * For example: "custom_dimension_one=value1,custom_dimension_two=value2,custom_dimension_three=value3" + */ + public static final String OTEL_VENICE_METRICS_CUSTOM_DIMENSIONS_MAP = "otel.venice.metrics.custom.dimensions.map"; + + /** + * Protocol over which the metrics are exported to {@link #OTEL_EXPORTER_OTLP_METRICS_ENDPOINT}
+ * 1. {@link OtlpConfigUtil#PROTOCOL_HTTP_PROTOBUF} => "http/protobuf"
+ * 2. {@link OtlpConfigUtil#PROTOCOL_GRPC} => "grpc" + */ + public static final String OTEL_EXPORTER_OTLP_METRICS_PROTOCOL = "otel.exporter.otlp.metrics.protocol"; + + /** + * The Endpoint to which the metrics are exported + */ + public static final String OTEL_EXPORTER_OTLP_METRICS_ENDPOINT = "otel.exporter.otlp.metrics.endpoint"; + + /** + * Additional headers to pass while creating OpenTelemetry exporter + */ + public static final String OTEL_EXPORTER_OTLP_METRICS_HEADERS = "otel.exporter.otlp.metrics.headers"; + + /** + * Aggregation temporality preference for exported metrics: one of "cumulative", "delta" or "lowmemory" + */ + public static final String OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE = + "otel.exporter.otlp.metrics.temporality.preference"; + + /** + * Default histogram aggregation to be used for all histograms: Select one of the below
+ * 1. base2_exponential_bucket_histogram
+ * 2. explicit_bucket_histogram + */ + public static final String OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION = + "otel.exporter.otlp.metrics.default.histogram.aggregation"; + + /** + * Max scale for base2_exponential_bucket_histogram; required when that aggregation is explicitly configured (it is parsed without a default in that case) + */ + public static final String OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_SCALE = + "otel.exporter.otlp.metrics.default.histogram.aggregation.max.scale"; + + /** + * Max buckets for base2_exponential_bucket_histogram; required when that aggregation is explicitly configured (it is parsed without a default in that case) + */ + public static final String OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_BUCKETS = + "otel.exporter.otlp.metrics.default.histogram.aggregation.max.buckets"; + + private final String serviceName; + private final String metricPrefix; + private final Collection metricEntities; + /** reusing tehuti's MetricConfig */ + private final MetricConfig tehutiMetricConfig; + + /** Below are the configs for OpenTelemetry metrics */ + + /** Feature flag to use OpenTelemetry instrumentation for metrics or not */ + private final boolean emitOTelMetrics; + + /** extra configs for OpenTelemetry. Supports 2 exporters currently
+ * 1. {@link MetricExporter} for exporting to Http/Grpc endpoint. More details are supported via configs, + * check {@link Builder#extractAndSetOtelConfigs} and {@link VeniceOpenTelemetryMetricsRepository#getOtlpHttpMetricExporter}
+ * 2. {@link VeniceOpenTelemetryMetricsRepository.LogBasedMetricExporter} for debug purposes + */ + private final boolean exportOtelMetricsToEndpoint; + private final boolean exportOtelMetricsToLog; + + /** Custom dimensions */ + private final Map otelCustomDimensionsMap; + + /** + * protocol for OpenTelemetry exporter. supports + * 1. {@link OtlpConfigUtil#PROTOCOL_HTTP_PROTOBUF} => "http/protobuf" + * 2. {@link OtlpConfigUtil#PROTOCOL_GRPC} => "grpc" + */ + private final String otelExportProtocol; + + /** endpoint to export OpenTelemetry Metrics to */ + private final String otelEndpoint; + + /** Headers to be passed while creating OpenTelemetry exporter */ + private final Map otelHeaders; + + /** Metric naming conventions for OpenTelemetry metrics */ + private final VeniceOpenTelemetryMetricNamingFormat metricNamingFormat; + + /** Aggregation Temporality selector to export only the delta or cumulate or different */ + private final AggregationTemporalitySelector otelAggregationTemporalitySelector; + + /** Default histogram aggregation to be used for all histograms: exponential or explicit bucket histogram */ + private final boolean useOtelExponentialHistogram; + private final int otelExponentialHistogramMaxScale; + private final int otelExponentialHistogramMaxBuckets; + + private VeniceMetricsConfig(Builder builder) { + this.serviceName = builder.serviceName; + this.metricPrefix = builder.metricPrefix; + this.metricEntities = builder.metricEntities; + this.emitOTelMetrics = builder.emitOtelMetrics; + this.exportOtelMetricsToEndpoint = builder.exportOtelMetricsToEndpoint; + this.otelCustomDimensionsMap = builder.otelCustomDimensionsMap; + this.otelExportProtocol = builder.otelExportProtocol; + this.otelEndpoint = builder.otelEndpoint; + this.otelHeaders = builder.otelHeaders; + this.exportOtelMetricsToLog = builder.exportOtelMetricsToLog; + this.metricNamingFormat = builder.metricNamingFormat; + this.otelAggregationTemporalitySelector = 
builder.otelAggregationTemporalitySelector; + this.useOtelExponentialHistogram = builder.useOtelExponentialHistogram; + this.otelExponentialHistogramMaxScale = builder.otelExponentialHistogramMaxScale; + this.otelExponentialHistogramMaxBuckets = builder.otelExponentialHistogramMaxBuckets; + this.tehutiMetricConfig = builder.tehutiMetricConfig; + } + + public static class Builder { + private String serviceName = "default_service"; + private String metricPrefix = null; + private Collection metricEntities = new ArrayList<>(); + private boolean emitOtelMetrics = false; + private boolean exportOtelMetricsToEndpoint = false; + private Map otelCustomDimensionsMap = new HashMap<>(); + private String otelExportProtocol = OtlpConfigUtil.PROTOCOL_HTTP_PROTOBUF; + private String otelEndpoint = null; + Map otelHeaders = new HashMap<>(); + private boolean exportOtelMetricsToLog = false; + private VeniceOpenTelemetryMetricNamingFormat metricNamingFormat = SNAKE_CASE; + private AggregationTemporalitySelector otelAggregationTemporalitySelector = + AggregationTemporalitySelector.deltaPreferred(); + private boolean useOtelExponentialHistogram = true; + private int otelExponentialHistogramMaxScale = 3; + private int otelExponentialHistogramMaxBuckets = 250; + private MetricConfig tehutiMetricConfig = null; + + public Builder setServiceName(String serviceName) { + this.serviceName = serviceName; + return this; + } + + public Builder setMetricPrefix(String metricPrefix) { + this.metricPrefix = metricPrefix; + return this; + } + + public Builder setMetricEntities(Collection metricEntities) { + this.metricEntities = metricEntities; + return this; + } + + public Builder setEmitOtelMetrics(boolean emitOtelMetrics) { + this.emitOtelMetrics = emitOtelMetrics; + return this; + } + + public Builder setExportOtelMetricsToEndpoint(boolean exportOtelMetricsToEndpoint) { + this.exportOtelMetricsToEndpoint = exportOtelMetricsToEndpoint; + return this; + } + + public Builder 
setOtelExportProtocol(String otelExportProtocol) { + this.otelExportProtocol = otelExportProtocol; + return this; + } + + public Builder setOtelEndpoint(String otelEndpoint) { + this.otelEndpoint = otelEndpoint; + return this; + } + + public Builder setExportOtelMetricsToLog(boolean exportOtelMetricsToLog) { + this.exportOtelMetricsToLog = exportOtelMetricsToLog; + return this; + } + + public Builder setMetricNamingFormat(VeniceOpenTelemetryMetricNamingFormat metricNamingFormat) { + this.metricNamingFormat = metricNamingFormat; + return this; + } + + public Builder setOtelAggregationTemporalitySelector( + AggregationTemporalitySelector otelAggregationTemporalitySelector) { + this.otelAggregationTemporalitySelector = otelAggregationTemporalitySelector; + return this; + } + + public Builder setUseOtelExponentialHistogram(boolean useOtelExponentialHistogram) { + this.useOtelExponentialHistogram = useOtelExponentialHistogram; + return this; + } + + public Builder setOtelExponentialHistogramMaxScale(int otelExponentialHistogramMaxScale) { + this.otelExponentialHistogramMaxScale = otelExponentialHistogramMaxScale; + return this; + } + + public Builder setOtelExponentialHistogramMaxBuckets(int otelExponentialHistogramMaxBuckets) { + this.otelExponentialHistogramMaxBuckets = otelExponentialHistogramMaxBuckets; + return this; + } + + /** + * Extract and set otel configs + */ + public Builder extractAndSetOtelConfigs(Map configs) { + String configValue; + if ((configValue = configs.get(OTEL_VENICE_METRICS_ENABLED)) != null) { + setEmitOtelMetrics(Boolean.parseBoolean(configValue)); + } + + if ((configValue = configs.get(OTEL_VENICE_METRICS_PREFIX)) != null) { + setMetricPrefix(configValue); + } + + if ((configValue = configs.get(OTEL_VENICE_METRICS_EXPORT_TO_LOG)) != null) { + setExportOtelMetricsToLog(Boolean.parseBoolean(configValue)); + } + + if ((configValue = configs.get(OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT)) != null) { + 
+ setExportOtelMetricsToEndpoint(Boolean.parseBoolean(configValue)); + } + + /** + * Custom dimensions are passed as key=value pairs, with each key separated from its value by '='
+ * Multiple dimensions are separated by ',' + */ + if ((configValue = configs.get(OTEL_VENICE_METRICS_CUSTOM_DIMENSIONS_MAP)) != null) { + String[] dimensions = configValue.split(","); + for (String dimension: dimensions) { + String[] keyValue = dimension.split("="); + if (keyValue.length != 2) { + throw new IllegalArgumentException("Invalid custom dimensions: " + configValue); + } + otelCustomDimensionsMap.put(keyValue[0], keyValue[1]); + } + } + + if ((configValue = configs.get(OTEL_EXPORTER_OTLP_METRICS_PROTOCOL)) != null) { + setOtelExportProtocol(configValue); + } + + if ((configValue = configs.get(OTEL_VENICE_METRICS_NAMING_FORMAT)) != null) { + setMetricNamingFormat(VeniceOpenTelemetryMetricNamingFormat.valueOf(configValue.toUpperCase(Locale.ROOT))); + } + + if ((configValue = configs.get(OTEL_EXPORTER_OTLP_METRICS_ENDPOINT)) != null) { + // validate endpoint: TODO + setOtelEndpoint(configValue); + } + + /** + * Headers are passed as key=value pairs separated by '=' + * Multiple headers are separated by ',' + * + * Currently supporting 1 header + */ + if ((configValue = configs.get(OTEL_EXPORTER_OTLP_METRICS_HEADERS)) != null) { + String[] headers = configValue.split("="); + otelHeaders.put(headers[0], headers[1]); + } + + if ((configValue = configs.get(OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE)) != null) { + switch (configValue.toLowerCase(Locale.ROOT)) { + case "cumulative": + setOtelAggregationTemporalitySelector(AggregationTemporalitySelector.alwaysCumulative()); + break; + case "delta": + setOtelAggregationTemporalitySelector(AggregationTemporalitySelector.deltaPreferred()); + break; + case "lowmemory": + setOtelAggregationTemporalitySelector(AggregationTemporalitySelector.lowMemory()); + break; + default: + throw new IllegalArgumentException("Unrecognized aggregation temporality: " + configValue); + } + } + + if ((configValue = configs.get(OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION)) != null) { + switch 
(configValue.toLowerCase(Locale.ROOT)) { + case "base2_exponential_bucket_histogram": + setUseOtelExponentialHistogram(true); + String maxScaleValue = configs.get(OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_SCALE); + setOtelExponentialHistogramMaxScale(Integer.parseInt(maxScaleValue)); + String maxBucketValue = configs.get(OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_BUCKETS); + setOtelExponentialHistogramMaxBuckets(Integer.parseInt(maxBucketValue)); + break; + + case "explicit_bucket_histogram": + setUseOtelExponentialHistogram(false); + break; + + default: + throw new IllegalArgumentException("Unrecognized default histogram aggregation: " + configValue); + } + } + + // todo: add more configs + // "otel.exporter.otlp.metrics.compression" + // "otel.exporter.otlp.metrics.timeout" + return this; + } + + public Builder setTehutiMetricConfig(MetricConfig tehutiMetricConfig) { + this.tehutiMetricConfig = tehutiMetricConfig; + return this; + } + + // Validate required fields before building + private void checkAndSetDefaults() { + if (tehutiMetricConfig == null) { + setTehutiMetricConfig(new MetricConfig()); + } + + if (metricPrefix == null) { + LOGGER.warn("metricPrefix is not set. 
Defaulting to empty string"); + setMetricPrefix(""); + } + + if (emitOtelMetrics) { + if (exportOtelMetricsToEndpoint) { + if (otelEndpoint == null) { + throw new IllegalArgumentException("endpoint is required to configure OpenTelemetry metrics export"); + } + + } else { + LOGGER.warn("OpenTelemetry metrics are enabled but no endpoint is configured to export metrics"); + } + } else { + LOGGER.warn("OpenTelemetry metrics are disabled"); + } + } + + public VeniceMetricsConfig build() { + checkAndSetDefaults(); + return new VeniceMetricsConfig(this); + } + } + + // all getters + public String getServiceName() { + return this.serviceName; + } + + public String getMetricPrefix() { + return this.metricPrefix; + } + + public Collection getMetricEntities() { + return this.metricEntities; + } + + public boolean emitOtelMetrics() { + return emitOTelMetrics; + } + + public boolean exportOtelMetricsToEndpoint() { + return exportOtelMetricsToEndpoint; + } + + public Map getOtelCustomDimensionsMap() { + return otelCustomDimensionsMap; + } + + public String getOtelExportProtocol() { + return otelExportProtocol; + } + + public String getOtelEndpoint() { + return otelEndpoint; + } + + public boolean exportOtelMetricsToLog() { + return exportOtelMetricsToLog; + } + + public Map getOtelHeaders() { + return otelHeaders; + } + + public VeniceOpenTelemetryMetricNamingFormat getMetricNamingFormat() { + return metricNamingFormat; + } + + public AggregationTemporalitySelector getOtelAggregationTemporalitySelector() { + return otelAggregationTemporalitySelector; + } + + public boolean useOtelExponentialHistogram() { + return useOtelExponentialHistogram; + } + + public int getOtelExponentialHistogramMaxScale() { + return otelExponentialHistogramMaxScale; + } + + public int getOtelExponentialHistogramMaxBuckets() { + return otelExponentialHistogramMaxBuckets; + } + + public MetricConfig getTehutiMetricConfig() { + return tehutiMetricConfig; + } + + @Override + public String toString() { + 
return "VeniceMetricsConfig{" + "serviceName='" + serviceName + '\'' + ", metricPrefix='" + metricPrefix + '\'' + + ", metricEntities=" + metricEntities + ", emitOTelMetrics=" + emitOTelMetrics + + ", exportOtelMetricsToEndpoint=" + exportOtelMetricsToEndpoint + ", otelCustomDimensionsMap=" + + otelCustomDimensionsMap + ", otelExportProtocol='" + otelExportProtocol + '\'' + ", otelEndpoint='" + + otelEndpoint + '\'' + ", otelHeaders=" + otelHeaders + ", exportOtelMetricsToLog=" + exportOtelMetricsToLog + + ", metricNamingFormat=" + metricNamingFormat + ", otelAggregationTemporalitySelector=" + + otelAggregationTemporalitySelector + ", useOtelExponentialHistogram=" + useOtelExponentialHistogram + + ", otelExponentialHistogramMaxScale=" + otelExponentialHistogramMaxScale + + ", otelExponentialHistogramMaxBuckets=" + otelExponentialHistogramMaxBuckets + ", tehutiMetricConfig=" + + tehutiMetricConfig + '}'; + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsRepository.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsRepository.java new file mode 100644 index 0000000000..811b3587f9 --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsRepository.java @@ -0,0 +1,70 @@ +package com.linkedin.venice.stats; + +import com.linkedin.venice.stats.metrics.MetricEntity; +import io.tehuti.metrics.JmxReporter; +import io.tehuti.metrics.MetricsRepository; +import java.io.Closeable; +import java.util.Collection; +import java.util.Map; + + +/** + * Repository to hold both tehuti and OpenTelemetry metrics. + * This class extends {@link MetricsRepository} to keep the changes to a minimum and + * to avoid a breaking change.
+ * Once all components are migrated to use this class: make this class add {@link MetricsRepository} + * as a member variable and delegate all tehuti calls to it. + */ +public class VeniceMetricsRepository extends MetricsRepository implements Closeable { + private final VeniceMetricsConfig veniceMetricsConfig; + private final VeniceOpenTelemetryMetricsRepository openTelemetryMetricsRepository; + + public VeniceMetricsRepository() { + super(); + this.veniceMetricsConfig = new VeniceMetricsConfig.Builder().build(); + this.openTelemetryMetricsRepository = new VeniceOpenTelemetryMetricsRepository(veniceMetricsConfig); + } + + public VeniceMetricsRepository(VeniceMetricsConfig veniceMetricsConfig) { + this(veniceMetricsConfig, new VeniceOpenTelemetryMetricsRepository(veniceMetricsConfig)); + } + + public VeniceMetricsRepository( + VeniceMetricsConfig veniceMetricsConfig, + VeniceOpenTelemetryMetricsRepository openTelemetryMetricsRepository) { + super(veniceMetricsConfig.getTehutiMetricConfig()); + this.veniceMetricsConfig = veniceMetricsConfig; + this.openTelemetryMetricsRepository = openTelemetryMetricsRepository; + } + + public VeniceOpenTelemetryMetricsRepository getOpenTelemetryMetricsRepository() { + return this.openTelemetryMetricsRepository; + } + + public VeniceMetricsConfig getVeniceMetricsConfig() { + return veniceMetricsConfig; + } + + @Override + public void close() { + super.close(); + if (openTelemetryMetricsRepository != null) { + openTelemetryMetricsRepository.close(); + } + } + + public static VeniceMetricsRepository getVeniceMetricsRepository( + String serviceName, + String metricPrefix, + Collection metricEntities, + Map configs) { + VeniceMetricsRepository metricsRepository = new VeniceMetricsRepository( + new VeniceMetricsConfig.Builder().setServiceName(serviceName) + .setMetricPrefix(metricPrefix) + .setMetricEntities(metricEntities) + .extractAndSetOtelConfigs(configs) + .build()); + metricsRepository.addReporter(new JmxReporter(serviceName)); + 
return metricsRepository; + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricNamingFormat.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricNamingFormat.java new file mode 100644 index 0000000000..b8a3caf2b6 --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricNamingFormat.java @@ -0,0 +1,93 @@ +package com.linkedin.venice.stats; + +import com.linkedin.venice.utils.VeniceEnumValue; + + +public enum VeniceOpenTelemetryMetricNamingFormat implements VeniceEnumValue { + /** + * Default format if not configured, names are defined as per this. + * should use snake case as per https://opentelemetry.io/docs/specs/semconv/general/attribute-naming/ + * For example: http.response.status_code + */ + SNAKE_CASE(0), + /** + * Alternate format for attribute names. If configured, defined names in snake_case will be + * transformed to either one of below formats. + * + * camel case: For example, http.response.statusCode + * pascal case: For example, Http.Response.StatusCode + */ + CAMEL_CASE(1), PASCAL_CASE(2); + + private final int value; + + VeniceOpenTelemetryMetricNamingFormat(int value) { + this.value = value; + } + + public static final int SIZE = values().length; + + @Override + public int getValue() { + return value; + } + + /** + * validate whether the input name is defined as a valid {@link VeniceOpenTelemetryMetricNamingFormat#SNAKE_CASE} + */ + public static void validateMetricName(String name) { + if (name == null || name.isEmpty()) { + throw new IllegalArgumentException("Metric name cannot be null or empty. Input name: " + name); + } + if (name.contains(" ")) { + throw new IllegalArgumentException("Metric name cannot contain spaces. 
Input name: " + name); + } + // name should not contain any capital or special characters except for underscore and dot + if (!name.matches("^[a-z0-9_.]*$")) { + throw new IllegalArgumentException( + "Metric name can only contain lowercase alphabets, numbers, underscore and dot. Input name: " + name); + } + } + + public static String transformMetricName(String input, VeniceOpenTelemetryMetricNamingFormat metricFormat) { + if (metricFormat == SNAKE_CASE) { + // no transformation needed as it should be defined in snake case by default + validateMetricName(input); + return input; + } + String[] words = input.split("\\."); + for (int i = 0; i < words.length; i++) { + if (!words[i].isEmpty()) { + String[] partWords = words[i].split("_"); + for (int j = 0; j < partWords.length; j++) { + if (metricFormat == PASCAL_CASE || j > 0) { + // either pascal case or camel case except for the first word + partWords[j] = capitalizeFirstLetter(partWords[j]); + } + } + StringBuilder sb = new StringBuilder(); + for (String partWord: partWords) { + sb.append(partWord); + } + words[i] = sb.toString(); + } + } + StringBuilder finalName = new StringBuilder(); + for (String word: words) { + finalName.append(word); + finalName.append("."); + } + // remove the last dot + if (finalName.length() > 0) { + finalName.deleteCharAt(finalName.length() - 1); + } + return finalName.toString(); + } + + private static String capitalizeFirstLetter(String word) { + if (word.isEmpty()) { + return word; + } + return Character.toUpperCase(word.charAt(0)) + word.substring(1); + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java new file mode 100644 index 0000000000..912f8619a2 --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java @@ -0,0 +1,257 
@@ +package com.linkedin.venice.stats; + +import static com.linkedin.venice.stats.VeniceOpenTelemetryMetricNamingFormat.transformMetricName; +import static com.linkedin.venice.stats.VeniceOpenTelemetryMetricNamingFormat.validateMetricName; + +import com.linkedin.venice.exceptions.VeniceException; +import com.linkedin.venice.stats.metrics.MetricEntity; +import com.linkedin.venice.stats.metrics.MetricType; +import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; +import io.opentelemetry.api.OpenTelemetry; +import io.opentelemetry.api.metrics.DoubleHistogram; +import io.opentelemetry.api.metrics.DoubleHistogramBuilder; +import io.opentelemetry.api.metrics.LongCounter; +import io.opentelemetry.api.metrics.LongCounterBuilder; +import io.opentelemetry.api.metrics.Meter; +import io.opentelemetry.exporter.otlp.http.metrics.OtlpHttpMetricExporter; +import io.opentelemetry.exporter.otlp.http.metrics.OtlpHttpMetricExporterBuilder; +import io.opentelemetry.sdk.OpenTelemetrySdk; +import io.opentelemetry.sdk.common.CompletableResultCode; +import io.opentelemetry.sdk.metrics.Aggregation; +import io.opentelemetry.sdk.metrics.InstrumentSelector; +import io.opentelemetry.sdk.metrics.InstrumentSelectorBuilder; +import io.opentelemetry.sdk.metrics.InstrumentType; +import io.opentelemetry.sdk.metrics.SdkMeterProvider; +import io.opentelemetry.sdk.metrics.SdkMeterProviderBuilder; +import io.opentelemetry.sdk.metrics.View; +import io.opentelemetry.sdk.metrics.data.AggregationTemporality; +import io.opentelemetry.sdk.metrics.data.MetricData; +import io.opentelemetry.sdk.metrics.export.MetricExporter; +import io.opentelemetry.sdk.metrics.export.PeriodicMetricReader; +import io.opentelemetry.sdk.resources.Resource; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + + +public class 
VeniceOpenTelemetryMetricsRepository { + private static final Logger LOGGER = LogManager.getLogger(VeniceOpenTelemetryMetricsRepository.class); + private SdkMeterProvider sdkMeterProvider = null; + private final boolean emitOpenTelemetryMetrics; + private final VeniceOpenTelemetryMetricNamingFormat metricFormat; + private Meter meter; + + private String metricPrefix; + + /** Below Maps are to create only one metric per name and type: Venice code will try to initialize the same metric multiple times as it will get + * called from per store path and per request type path. This will ensure that we only have one metric per name and + * use dimensions to differentiate between them. + */ + private final VeniceConcurrentHashMap histogramMap = new VeniceConcurrentHashMap<>(); + private final VeniceConcurrentHashMap counterMap = new VeniceConcurrentHashMap<>(); + + MetricExporter getOtlpHttpMetricExporter(VeniceMetricsConfig metricsConfig) { + OtlpHttpMetricExporterBuilder exporterBuilder = + OtlpHttpMetricExporter.builder().setEndpoint(metricsConfig.getOtelEndpoint()); + for (Map.Entry entry: metricsConfig.getOtelHeaders().entrySet()) { + String key = entry.getKey(); + String value = entry.getValue(); + exporterBuilder.addHeader(key, value); + } + if (metricsConfig.getOtelAggregationTemporalitySelector() != null) { + exporterBuilder.setAggregationTemporalitySelector(metricsConfig.getOtelAggregationTemporalitySelector()); + } + return exporterBuilder.build(); + } + + /** + * Setting Exponential Histogram aggregation for {@link MetricType#HISTOGRAM} by looping through all + * the metric entities set for this service to registering the view with exponential histogram aggregation for + * all the {@link MetricType#HISTOGRAM} metrics. 
+ * + * There is a limitation in opentelemetry sdk to configure different histogram aggregation for different + * instruments, so {@link OtlpHttpMetricExporterBuilder#setDefaultAggregationSelector} to enable exponential + * histogram aggregation is not used here to not convert the histograms of type {@link MetricType#MIN_MAX_COUNT_SUM_AGGREGATIONS} + * to exponential histograms to be able to follow explict boundaries. + */ + private void setExponentialHistogramAggregation(SdkMeterProviderBuilder builder, VeniceMetricsConfig metricsConfig) { + List metricNames = new ArrayList<>(); + + if (metricsConfig.getMetricEntities().isEmpty()) { + LOGGER + .warn("No metric entities found in config: {} to configure exponential histogram", metricsConfig.toString()); + } + + for (MetricEntity metricEntity: metricsConfig.getMetricEntities()) { + if (metricEntity.getMetricType() == MetricType.HISTOGRAM) { + metricNames.add(getFullMetricName(getMetricPrefix(), metricEntity.getMetricName())); + } + } + + // Build an InstrumentSelector with multiple setName calls for all Exponential Histogram metrics + InstrumentSelectorBuilder selectorBuilder = InstrumentSelector.builder().setType(InstrumentType.HISTOGRAM); + metricNames.forEach(selectorBuilder::setName); + + // Register a single view with all metric names included in the InstrumentSelector + builder.registerView( + selectorBuilder.build(), + View.builder() + .setAggregation( + Aggregation.base2ExponentialBucketHistogram( + metricsConfig.getOtelExponentialHistogramMaxBuckets(), + metricsConfig.getOtelExponentialHistogramMaxScale())) + .build()); + } + + public VeniceOpenTelemetryMetricsRepository(VeniceMetricsConfig metricsConfig) { + emitOpenTelemetryMetrics = metricsConfig.emitOtelMetrics(); + metricFormat = metricsConfig.getMetricNamingFormat(); + if (!emitOpenTelemetryMetrics) { + LOGGER.info("OpenTelemetry metrics are disabled"); + return; + } + LOGGER.info( + "OpenTelemetry initialization for {} started with config: {}", + 
metricsConfig.getServiceName(), + metricsConfig.toString()); + this.metricPrefix = "venice." + metricsConfig.getMetricPrefix(); + validateMetricName(this.metricPrefix); + try { + SdkMeterProviderBuilder builder = SdkMeterProvider.builder(); + if (metricsConfig.exportOtelMetricsToEndpoint()) { + MetricExporter httpExporter = getOtlpHttpMetricExporter(metricsConfig); + builder.registerMetricReader(PeriodicMetricReader.builder(httpExporter).build()); + } + if (metricsConfig.exportOtelMetricsToLog()) { + // internal to test: Disabled by default + builder.registerMetricReader(PeriodicMetricReader.builder(new LogBasedMetricExporter(metricsConfig)).build()); + } + + if (metricsConfig.useOtelExponentialHistogram()) { + setExponentialHistogramAggregation(builder, metricsConfig); + } + + builder.setResource(Resource.empty()); + sdkMeterProvider = builder.build(); + + // Register MeterProvider with the OpenTelemetry instance + OpenTelemetry openTelemetry = OpenTelemetrySdk.builder().setMeterProvider(sdkMeterProvider).build(); + + this.meter = openTelemetry.getMeter(transformMetricName(getMetricPrefix(), metricFormat)); + LOGGER.info( + "OpenTelemetry initialization for {} completed with config: {}", + metricsConfig.getServiceName(), + metricsConfig.toString()); + } catch (Exception e) { + String err = "OpenTelemetry initialization for " + metricsConfig.getServiceName() + " failed with config: " + + metricsConfig.toString(); + LOGGER.error(err, e); + throw new VeniceException(err, e); + } + } + + String getFullMetricName(String metricPrefix, String name) { + String fullMetricName = metricPrefix + "." 
+ name; + validateMetricName(fullMetricName); + return transformMetricName(fullMetricName, metricFormat); + } + + private String getMetricPrefix() { + return metricPrefix; + } + + public DoubleHistogram createHistogram(MetricEntity metricEntity) { + if (!emitOpenTelemetryMetrics) { + return null; + } + return histogramMap.computeIfAbsent(metricEntity.getMetricName(), key -> { + String fullMetricName = getFullMetricName(getMetricPrefix(), metricEntity.getMetricName()); + DoubleHistogramBuilder builder = meter.histogramBuilder(fullMetricName) + .setUnit(metricEntity.getUnit().name()) + .setDescription(metricEntity.getDescription()); + if (metricEntity.getMetricType() == MetricType.MIN_MAX_COUNT_SUM_AGGREGATIONS) { + // No buckets needed to get only min/max/count/sum aggregations + builder.setExplicitBucketBoundariesAdvice(new ArrayList<>()); + } + return builder.build(); + }); + } + + public LongCounter createCounter(MetricEntity metricEntity) { + if (!emitOpenTelemetryMetrics) { + return null; + } + return counterMap.computeIfAbsent(metricEntity.getMetricName(), key -> { + String fullMetricName = getFullMetricName(getMetricPrefix(), metricEntity.getMetricName()); + LongCounterBuilder builder = meter.counterBuilder(fullMetricName) + .setUnit(metricEntity.getUnit().name()) + .setDescription(metricEntity.getDescription()); + return builder.build(); + }); + } + + public Object createInstrument(MetricEntity metricEntity) { + MetricType metricType = metricEntity.getMetricType(); + switch (metricType) { + case HISTOGRAM: + case MIN_MAX_COUNT_SUM_AGGREGATIONS: + return createHistogram(metricEntity); + + case COUNTER: + return createCounter(metricEntity); + + default: + throw new VeniceException("Unknown metric type: " + metricType); + } + } + + public void close() { + if (sdkMeterProvider != null) { + sdkMeterProvider.shutdown(); + sdkMeterProvider = null; + } + } + + class LogBasedMetricExporter implements MetricExporter { + VeniceMetricsConfig metricsConfig; + + 
LogBasedMetricExporter(VeniceMetricsConfig metricsConfig) { + this.metricsConfig = metricsConfig; + } + + @Override + public AggregationTemporality getAggregationTemporality(InstrumentType instrumentType) { + return metricsConfig.getOtelAggregationTemporalitySelector().getAggregationTemporality(instrumentType); + } + + @Override + public CompletableResultCode export(Collection metrics) { + LOGGER.info("Logging OpenTelemetry metrics for debug purpose: {}", Arrays.toString(metrics.toArray())); + return CompletableResultCode.ofSuccess(); + } + + @Override + public CompletableResultCode flush() { + return CompletableResultCode.ofSuccess(); + } + + @Override + public CompletableResultCode shutdown() { + return CompletableResultCode.ofSuccess(); + } + } + + /** for testing purposes */ + SdkMeterProvider getSdkMeterProvider() { + return sdkMeterProvider; + } + + /** for testing purposes */ + Meter getMeter() { + return meter; + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/HttpResponseStatusCodeCategory.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/HttpResponseStatusCodeCategory.java new file mode 100644 index 0000000000..c357326370 --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/HttpResponseStatusCodeCategory.java @@ -0,0 +1,41 @@ +package com.linkedin.venice.stats.dimensions; + +import io.netty.handler.codec.http.HttpResponseStatus; +import io.netty.handler.codec.http.HttpStatusClass; + + +/** + * Maps the provided HTTP response status {@link HttpResponseStatus} to one of + * 1xx, 2xx, 3xx, 4xx, 5xx categories. 
/**
 * Reasons for aborting a request retry. Each constant exposes its name in lower case through
 * {@link #getAbortReason()}.
 */
public enum RequestRetryAbortReason {
  SLOW_ROUTE, DELAY_CONSTRAINT, MAX_RETRY_ROUTE_LIMIT, NO_AVAILABLE_REPLICA;

  private final String abortReason;

  RequestRetryAbortReason() {
    // Locale.ROOT keeps the value stable regardless of the JVM default locale
    // (e.g. in a Turkish locale 'I' would otherwise be lower-cased to a dotless i).
    this.abortReason = name().toLowerCase(java.util.Locale.ROOT);
  }

  /**
   * @return the lower-cased constant name, for example {@code max_retry_route_limit}
   */
  public String getAbortReason() {
    return this.abortReason;
  }
}
/**
 * Outcomes of validating a request. Each constant exposes its name in lower case through
 * {@link #getOutcome()}.
 */
public enum RequestValidationOutcome {
  VALID, INVALID_KEY_COUNT_LIMIT_EXCEEDED;

  private final String outcome;

  RequestValidationOutcome() {
    // Locale.ROOT keeps the value stable regardless of the JVM default locale
    // (e.g. in a Turkish locale 'I' would otherwise be lower-cased to a dotless i).
    this.outcome = name().toLowerCase(java.util.Locale.ROOT);
  }

  /**
   * @return the lower-cased constant name, for example {@code invalid_key_count_limit_exceeded}
   */
  public String getOutcome() {
    return this.outcome;
  }
}
VeniceMetricsDimensions { + VENICE_STORE_NAME("venice.store.name"), VENICE_CLUSTER_NAME("venice.cluster.name"), + + /** {@link com.linkedin.venice.read.RequestType} */ + VENICE_REQUEST_METHOD("venice.request.method"), + + /** {@link io.netty.handler.codec.http.HttpResponseStatus} ie. 200, 400, etc */ + HTTP_RESPONSE_STATUS_CODE("http.response.status_code"), + + /** {@link HttpResponseStatusCodeCategory} ie. 1xx, 2xx, etc */ + HTTP_RESPONSE_STATUS_CODE_CATEGORY("http.response.status_code_category"), + + /** {@link RequestValidationOutcome#outcome} */ + VENICE_REQUEST_VALIDATION_OUTCOME("venice.request.validation_outcome"), + + /** {@link VeniceResponseStatusCategory} */ + VENICE_RESPONSE_STATUS_CODE_CATEGORY("venice.response.status_code_category"), + + /** {@link RequestRetryType} */ + VENICE_REQUEST_RETRY_TYPE("venice.request.retry_type"), + + /** {@link RequestRetryAbortReason} */ + VENICE_REQUEST_RETRY_ABORT_REASON("venice.request.retry_abort_reason"); + + private final String[] dimensionName = new String[VeniceOpenTelemetryMetricNamingFormat.SIZE]; + + VeniceMetricsDimensions(String dimensionName) { + validateMetricName(dimensionName); + this.dimensionName[SNAKE_CASE.getValue()] = dimensionName; + this.dimensionName[CAMEL_CASE.getValue()] = transformMetricName(dimensionName, CAMEL_CASE); + this.dimensionName[PASCAL_CASE.getValue()] = transformMetricName(dimensionName, PASCAL_CASE); + } + + public String getDimensionName(VeniceOpenTelemetryMetricNamingFormat format) { + return dimensionName[format.getValue()]; + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategory.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategory.java new file mode 100644 index 0000000000..761c30cfdf --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategory.java @@ -0,0 +1,22 @@ 
/**
 * How Venice categorizes the response status of a request:
 * We are emitting both {@link HttpResponseStatusCodeCategory} and this enum to capture the http standard as
 * well as the Venice specific categorization. For instance, Venice considers key not found as a healthy
 * response, but http standard would consider it a 404 (4xx) which leads to checking for both 200 and 404
 * to account for all healthy requests. This dimension makes it easier to make Venice specific aggregations.
 */
public enum VeniceResponseStatusCategory {
  HEALTHY, UNHEALTHY, TARDY, THROTTLED, BAD_REQUEST;

  private final String category;

  VeniceResponseStatusCategory() {
    // Locale.ROOT keeps the value independent of the JVM default locale, for the current
    // constants and for any constant added later.
    this.category = name().toLowerCase(java.util.Locale.ROOT);
  }

  /**
   * @return the lower-cased constant name, for example {@code bad_request}
   */
  public String getCategory() {
    return this.category;
  }
}
@Nonnull String metricName, + @Nonnull MetricType metricType, + @Nonnull MetricUnit unit, + @Nonnull String description, + @Nullable Set dimensionsList) { + Validate.notEmpty(metricName, "Metric name cannot be null or empty"); + this.metricName = metricName; + this.metricType = metricType; + this.unit = unit; + this.description = description; + this.dimensionsList = dimensionsList; + } + + @Nonnull + public String getMetricName() { + return metricName; + } + + @Nonnull + public MetricType getMetricType() { + return metricType; + } + + @Nonnull + public MetricUnit getUnit() { + return unit; + } + + @Nonnull + public String getDescription() { + return description; + } + + @Nullable + public Set getDimensionsList() { + return dimensionsList; + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java new file mode 100644 index 0000000000..2f9a823122 --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java @@ -0,0 +1,125 @@ +package com.linkedin.venice.stats.metrics; + +import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.metrics.DoubleHistogram; +import io.opentelemetry.api.metrics.LongCounter; +import io.tehuti.metrics.MeasurableStat; +import io.tehuti.metrics.Sensor; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + + +/** + * Operational state of a metric. It holds
+ * 1. {@link MetricEntity} + * 2. 1 Otel Instrument and + * 3. multiple tehuti Sensors for this Otel Metric + */ +public class MetricEntityState { + private MetricEntity metricEntity; + /** Otel metric */ + private Object otelMetric = null; + /** Map of tehuti names and sensors: 1 Otel metric can cover multiple Tehuti sensors */ + private Map tehutiSensors = null; + + public MetricEntityState(MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository) { + this.metricEntity = metricEntity; + setOtelMetric(otelRepository.createInstrument(this.metricEntity)); + } + + public MetricEntityState( + MetricEntity metricEntity, + VeniceOpenTelemetryMetricsRepository otelRepository, + TehutiSensorRegistrationFunction registerTehutiSensor, + Map> tehutiMetricInput) { + this.metricEntity = metricEntity; + createMetric(otelRepository, tehutiMetricInput, registerTehutiSensor); + } + + public void setOtelMetric(Object otelMetric) { + this.otelMetric = otelMetric; + } + + /** + * Add Tehuti {@link Sensor} to tehutiSensors map and throw exception if sensor with same name already exists + */ + public void addTehutiSensors(TehutiMetricNameEnum name, Sensor tehutiSensor) { + if (tehutiSensors == null) { + tehutiSensors = new HashMap<>(); + } + if (tehutiSensors.put(name, tehutiSensor) != null) { + throw new IllegalArgumentException("Sensor with name '" + name + "' already exists."); + } + } + + /** + * create the metrics/Sensors + */ + @FunctionalInterface + public interface TehutiSensorRegistrationFunction { + Sensor register(String sensorName, MeasurableStat... 
stats); + } + + public void createMetric( + VeniceOpenTelemetryMetricsRepository otelRepository, + Map> tehutiMetricInput, + TehutiSensorRegistrationFunction registerTehutiSensor) { + // Otel metric: otelRepository will be null if otel is not enabled + if (otelRepository != null) { + setOtelMetric(otelRepository.createInstrument(this.metricEntity)); + } + // tehuti metric + for (Map.Entry> entry: tehutiMetricInput.entrySet()) { + addTehutiSensors( + entry.getKey(), + registerTehutiSensor + .register(entry.getKey().getMetricName(), entry.getValue().toArray(new MeasurableStat[0]))); + } + } + + /** + * Record otel metrics + */ + void recordOtelMetric(double value, Attributes otelDimensions) { + if (otelMetric != null) { + MetricType metricType = this.metricEntity.getMetricType(); + switch (metricType) { + case HISTOGRAM: + case MIN_MAX_COUNT_SUM_AGGREGATIONS: + ((DoubleHistogram) otelMetric).record(value, otelDimensions); + break; + case COUNTER: + ((LongCounter) otelMetric).add((long) value, otelDimensions); + break; + + default: + throw new IllegalArgumentException("Unsupported metric type: " + metricType); + } + } + } + + void recordTehutiMetric(TehutiMetricNameEnum tehutiMetricNameEnum, double value) { + if (tehutiSensors != null) { + Sensor sensor = tehutiSensors.get(tehutiMetricNameEnum); + if (sensor != null) { + sensor.record(value); + } + } + } + + public void record(TehutiMetricNameEnum tehutiMetricNameEnum, long value, Attributes otelDimensions) { + recordOtelMetric(value, otelDimensions); + recordTehutiMetric(tehutiMetricNameEnum, value); + } + + public void record(TehutiMetricNameEnum tehutiMetricNameEnum, double value, Attributes otelDimensions) { + recordOtelMetric(value, otelDimensions); + recordTehutiMetric(tehutiMetricNameEnum, value); + } + + Map getTehutiSensors() { + return tehutiSensors; + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricType.java 
/**
 * Types of metrics that Venice supports via OpenTelemetry.
 */
public enum MetricType {
  /**
   * Histogram providing percentiles/min/max/count/sum and other aggregates. It can be configured to
   * use either exponential or explicit buckets:
   * check {@link VeniceMetricsConfig.Builder#extractAndSetOtelConfigs} for more details
   */
  HISTOGRAM,

  /**
   * Provides min/max/count/sum aggregations without the memory overhead to calculate percentiles,
   * by using an Otel explicit bucket histogram without any buckets.
   * check {@link VeniceOpenTelemetryMetricsRepository#createHistogram} and
   * {@link VeniceOpenTelemetryMetricsRepository#setExponentialHistogramAggregation} for more details
   */
  MIN_MAX_COUNT_SUM_AGGREGATIONS,

  /**
   * A simple counter that can be added to.
   */
  COUNTER
}
a/internal/venice-client-common/src/main/java/com/linkedin/venice/utils/VeniceProperties.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/utils/VeniceProperties.java @@ -458,4 +458,8 @@ public Properties toProperties() { public boolean isEmpty() { return this.props.isEmpty(); } + + /** Returns the props map held by this instance; the reference is handed out as-is, no copy is made here. */ public Map getAsMap() { + return props; + } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/utils/metrics/MetricsRepositoryUtils.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/utils/metrics/MetricsRepositoryUtils.java index b95502ab31..2d274f0c9d 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/utils/metrics/MetricsRepositoryUtils.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/utils/metrics/MetricsRepositoryUtils.java @@ -1,5 +1,7 @@ package com.linkedin.venice.utils.metrics; +import com.linkedin.venice.stats.VeniceMetricsConfig; +import com.linkedin.venice.stats.VeniceMetricsRepository; import io.tehuti.metrics.MetricConfig; import io.tehuti.metrics.MetricsRepository; import io.tehuti.metrics.stats.AsyncGauge; @@ -20,15 +22,33 @@ public static MetricsRepository createSingleThreadedMetricsRepository() { return createSingleThreadedMetricsRepository(TimeUnit.MINUTES.toMillis(1), 100); } + /** Convenience overload: delegates to the two-argument variant with a max measurement timeout of one minute (in ms) and an initial measurement timeout of 100 ms. */ public static MetricsRepository createSingleThreadedVeniceMetricsRepository() { + return createSingleThreadedVeniceMetricsRepository(TimeUnit.MINUTES.toMillis(1), 100); + } + + /** Builds a Tehuti MetricConfig around an AsyncGauge executor configured with one measurement thread, one slow-measurement thread and the given initial and max measurement timeouts (ms). */ public static MetricConfig getMetricConfig( + long maxMetricsMeasurementTimeoutMs, + long initialMetricsMeasurementTimeoutMs) { + return new MetricConfig( + new AsyncGauge.AsyncGaugeExecutor.Builder().setMetricMeasurementThreadCount(1) + .setSlowMetricMeasurementThreadCount(1) + .setInitialMetricsMeasurementTimeoutInMs(initialMetricsMeasurementTimeoutMs) + .setMaxMetricsMeasurementTimeoutInMs(maxMetricsMeasurementTimeoutMs) + .build()); + } + public static MetricsRepository 
createSingleThreadedMetricsRepository( long maxMetricsMeasurementTimeoutMs, long initialMetricsMeasurementTimeoutMs) { - return new MetricsRepository( - new MetricConfig( - new AsyncGauge.AsyncGaugeExecutor.Builder().setMetricMeasurementThreadCount(1) - .setSlowMetricMeasurementThreadCount(1) - .setInitialMetricsMeasurementTimeoutInMs(initialMetricsMeasurementTimeoutMs) - .setMaxMetricsMeasurementTimeoutInMs(maxMetricsMeasurementTimeoutMs) - .build())); + return new MetricsRepository(getMetricConfig(maxMetricsMeasurementTimeoutMs, initialMetricsMeasurementTimeoutMs)); + } + + /** Returns a VeniceMetricsRepository built from a VeniceMetricsConfig whose Tehuti MetricConfig comes from getMetricConfig with the given timeouts; no other builder option is set here. */ public static MetricsRepository createSingleThreadedVeniceMetricsRepository( + long maxMetricsMeasurementTimeoutMs, + long initialMetricsMeasurementTimeoutMs) { + return new VeniceMetricsRepository( + new VeniceMetricsConfig.Builder() + .setTehutiMetricConfig(getMetricConfig(maxMetricsMeasurementTimeoutMs, initialMetricsMeasurementTimeoutMs)) + .build()); } } diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/AbstractVeniceAggStatsTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/AbstractVeniceAggStatsTest.java new file mode 100644 index 0000000000..98d3721600 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/AbstractVeniceAggStatsTest.java @@ -0,0 +1,37 @@ +package com.linkedin.venice.stats; + +import static org.mockito.Mockito.mock; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.fail; + +import com.linkedin.venice.client.stats.ClientStats; +import io.tehuti.metrics.MetricsRepository; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + + +/** Checks the constructor argument validation of AbstractVeniceAggStats. */ public class AbstractVeniceAggStatsTest { + /** Supplies all four combinations of clusterName (null or test-cluster) and perClusterAggregate (false or true). NOTE(review): the method name fcRequestTypes does not describe this data and looks carried over from elsewhere; consider renaming. */ @DataProvider(name = "ClusterName-And-Boolean") + public Object[][] fcRequestTypes() { + return new Object[][] { { null, false }, { null, true }, { "test-cluster", false }, { "test-cluster", true } }; + } + + /** The constructor must throw IllegalArgumentException, with the exact message asserted below, only when clusterName is null and perClusterAggregate is true; every other combination must construct without that exception. */ @Test(dataProvider = 
"ClusterName-And-Boolean") + public void abstractVeniceAggStatsWithNoClusterName(String clusterName, boolean perClusterAggregate) { + MetricsRepository metricsRepository = mock(MetricsRepository.class); + StatsSupplier statsSupplier = mock(StatsSupplier.class); + try { + new AbstractVeniceAggStats(clusterName, metricsRepository, statsSupplier, perClusterAggregate) { + }; + if (clusterName == null && perClusterAggregate) { + fail("Expected IllegalArgumentException"); + } + } catch (IllegalArgumentException e) { + if (clusterName == null && perClusterAggregate) { + assertEquals(e.getMessage(), "perClusterAggregate cannot be true when clusterName is null"); + } else { + fail("IllegalArgumentException not expected"); + } + } + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsConfigTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsConfigTest.java new file mode 100644 index 0000000000..e7f04ff205 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsConfigTest.java @@ -0,0 +1,201 @@ +package com.linkedin.venice.stats; + +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_BUCKETS; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_SCALE; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_PROTOCOL; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_VENICE_METRICS_CUSTOM_DIMENSIONS_MAP; +import static 
com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_VENICE_METRICS_ENABLED; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_VENICE_METRICS_EXPORT_TO_LOG; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_VENICE_METRICS_NAMING_FORMAT; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertTrue; + +import com.linkedin.venice.stats.VeniceMetricsConfig.Builder; +import io.opentelemetry.exporter.otlp.internal.OtlpConfigUtil; +import io.opentelemetry.sdk.metrics.export.AggregationTemporalitySelector; +import io.tehuti.metrics.MetricConfig; +import java.util.HashMap; +import java.util.Map; +import org.testng.annotations.Test; + + +/** Unit tests for VeniceMetricsConfig.Builder: default values, extraction of OTel settings from a config map, and rejection of invalid settings. NOTE(review): the raw Map declarations in this class look like generic type arguments were lost; confirm against the real file. */ public class VeniceMetricsConfigTest { + /** Smoke test: a Builder with nothing set must build without throwing. */ @Test + public void testDefaultValues() { + new Builder().build(); + } + + /** With only the service name and metric prefix set, every OTel-related getter must return the default value asserted below. */ @Test + public void testDefaultValuesWithBasicConfig() { + VeniceMetricsConfig config = new Builder().setServiceName("noop_service").setMetricPrefix("service").build(); + assertEquals(config.getServiceName(), "noop_service"); + assertEquals(config.getMetricPrefix(), "service"); + assertFalse(config.emitOtelMetrics()); + assertFalse(config.exportOtelMetricsToEndpoint()); + assertEquals(config.getOtelExportProtocol(), OtlpConfigUtil.PROTOCOL_HTTP_PROTOBUF); + assertEquals(config.getOtelEndpoint(), null); + assertTrue(config.getOtelHeaders().isEmpty()); + assertFalse(config.exportOtelMetricsToLog()); + assertEquals(config.getMetricNamingFormat(), VeniceOpenTelemetryMetricNamingFormat.SNAKE_CASE); + assertEquals(config.getOtelAggregationTemporalitySelector(), AggregationTemporalitySelector.deltaPreferred()); + assertEquals(config.useOtelExponentialHistogram(), true); + assertEquals(config.getOtelExponentialHistogramMaxScale(), 3); + 
assertEquals(config.getOtelExponentialHistogramMaxBuckets(), 250); + assertNotNull(config.getTehutiMetricConfig()); + } + + /** Values supplied through the setters and through the OTel config map must be readable back from the built config. */ @Test + public void testCustomValues() { + Map otelConfigs = new HashMap<>(); + otelConfigs.put(OTEL_VENICE_METRICS_ENABLED, "true"); + otelConfigs.put(OTEL_VENICE_METRICS_EXPORT_TO_LOG, "true"); + + MetricConfig metricConfig = new MetricConfig(); + + VeniceMetricsConfig config = new Builder().setServiceName("TestService") + .setMetricPrefix("TestPrefix") + .setTehutiMetricConfig(metricConfig) + .extractAndSetOtelConfigs(otelConfigs) + .build(); + + assertEquals(config.getServiceName(), "TestService"); + assertEquals(config.getMetricPrefix(), "TestPrefix"); + assertTrue(config.emitOtelMetrics()); + assertTrue(config.exportOtelMetricsToLog()); + assertEquals(config.getTehutiMetricConfig(), metricConfig); + } + + /** Turning on endpoint export while the map holds only the two enable flags (no protocol or endpoint entry) must raise IllegalArgumentException. */ @Test(expectedExceptions = IllegalArgumentException.class) + public void testOtelMissingConfigs() { + Map invalidOtelConfigs = new HashMap<>(); + invalidOtelConfigs.put(OTEL_VENICE_METRICS_ENABLED, "true"); + invalidOtelConfigs.put(OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT, "true"); + + new Builder().setServiceName("TestService") + .setMetricPrefix("TestPrefix") + .extractAndSetOtelConfigs(invalidOtelConfigs) + .build(); + } + + /** An unrecognized naming format value must raise IllegalArgumentException. */ @Test(expectedExceptions = IllegalArgumentException.class) + public void testOtelConfigWithInvalidMetricFormat() { + Map otelConfigs = new HashMap<>(); + otelConfigs.put(OTEL_VENICE_METRICS_NAMING_FORMAT, "INVALID_FORMAT"); + + new Builder().extractAndSetOtelConfigs(otelConfigs).build(); + } + + /** The naming format value CAMEL_CASE must be mapped to the matching VeniceOpenTelemetryMetricNamingFormat constant. */ @Test + public void testOtelConfigWithValidMetricFormat() { + Map otelConfigs = new HashMap<>(); + otelConfigs.put(OTEL_VENICE_METRICS_ENABLED, "true"); + otelConfigs.put(OTEL_VENICE_METRICS_NAMING_FORMAT, "CAMEL_CASE"); + + VeniceMetricsConfig config = new Builder().setServiceName("TestService") + .setMetricPrefix("TestPrefix") + .extractAndSetOtelConfigs(otelConfigs) + .build(); + + 
assertEquals(config.getMetricNamingFormat(), VeniceOpenTelemetryMetricNamingFormat.CAMEL_CASE); + } + + /** Endpoint export with both protocol and endpoint supplied must build, and the built config must expose those values. */ @Test + public void testEnableHttpGrpcEndpointConfigWithRequiredFields() { + Map otelConfigs = new HashMap<>(); + otelConfigs.put(OTEL_VENICE_METRICS_ENABLED, "true"); + otelConfigs.put(OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT, "true"); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_PROTOCOL, OtlpConfigUtil.PROTOCOL_HTTP_PROTOBUF); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, "http://localhost"); + + VeniceMetricsConfig config = new Builder().setServiceName("TestService") + .setMetricPrefix("TestPrefix") + .extractAndSetOtelConfigs(otelConfigs) + .build(); + + assertTrue(config.exportOtelMetricsToEndpoint()); + assertEquals(config.getOtelExportProtocol(), OtlpConfigUtil.PROTOCOL_HTTP_PROTOBUF); + assertEquals(config.getOtelEndpoint(), "http://localhost"); + } + + /** The temporality preference value delta must produce the deltaPreferred selector. */ @Test + public void testSetAggregationTemporalitySelector() { + Map otelConfigs = new HashMap<>(); + otelConfigs.put(OTEL_VENICE_METRICS_ENABLED, "true"); + otelConfigs.put(OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT, "true"); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_PROTOCOL, OtlpConfigUtil.PROTOCOL_HTTP_PROTOBUF); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, "http://localhost"); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE, "delta"); + + VeniceMetricsConfig config = new Builder().setServiceName("TestService") + .setMetricPrefix("TestPrefix") + .extractAndSetOtelConfigs(otelConfigs) + .build(); + assertEquals(config.getOtelAggregationTemporalitySelector(), AggregationTemporalitySelector.deltaPreferred()); + } + + /** An unrecognized temporality preference must raise IllegalArgumentException. NOTE(review): the assertEquals after build() looks unreachable when the expected exception is thrown; confirm and consider dropping it. */ @Test(expectedExceptions = IllegalArgumentException.class) + public void testSetAggregationTemporalitySelectorInvalidConfig() { + Map otelConfigs = new HashMap<>(); + otelConfigs.put(OTEL_VENICE_METRICS_ENABLED, "true"); + otelConfigs.put(OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT, "true"); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_PROTOCOL, 
OtlpConfigUtil.PROTOCOL_HTTP_PROTOBUF); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, "http://localhost"); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE, "invalid"); + + VeniceMetricsConfig config = new Builder().setServiceName("TestService") + .setMetricPrefix("TestPrefix") + .extractAndSetOtelConfigs(otelConfigs) + .build(); + assertEquals(config.getOtelAggregationTemporalitySelector(), AggregationTemporalitySelector.deltaPreferred()); + } + + /** An unrecognized default histogram aggregation value must raise IllegalArgumentException, even with max scale and max buckets supplied. */ @Test(expectedExceptions = IllegalArgumentException.class) + public void testSetHistogramAggregationSelectorInvalidConfig() { + Map otelConfigs = new HashMap<>(); + otelConfigs.put(OTEL_VENICE_METRICS_ENABLED, "true"); + otelConfigs.put(OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT, "true"); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_PROTOCOL, OtlpConfigUtil.PROTOCOL_HTTP_PROTOBUF); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, "http://localhost"); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION, "invalid"); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_SCALE, "10"); + otelConfigs.put(OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_BUCKETS, "50"); + + new Builder().setServiceName("TestService") + .setMetricPrefix("TestPrefix") + .extractAndSetOtelConfigs(otelConfigs) + .build(); + } + + /** A comma-separated list of key=value pairs must be parsed into the custom dimensions map, one entry per pair. */ @Test + public void testSetOtelCustomDimensionsMap() { + Map otelConfigs = new HashMap<>(); + otelConfigs.put(OTEL_VENICE_METRICS_ENABLED, "true"); + otelConfigs.put(OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT, "false"); + otelConfigs.put(OTEL_VENICE_METRICS_CUSTOM_DIMENSIONS_MAP, "key1=value1,key2=value2"); + VeniceMetricsConfig config = new Builder().setServiceName("TestService") + .setMetricPrefix("TestPrefix") + .extractAndSetOtelConfigs(otelConfigs) + .build(); + assertEquals(config.getOtelCustomDimensionsMap().size(), 2); + assertEquals(config.getOtelCustomDimensionsMap().get("key1"), "value1"); + 
assertEquals(config.getOtelCustomDimensionsMap().get("key2"), "value2"); + } + + /** A malformed pair (key2=value2=3) in the custom dimensions value must raise IllegalArgumentException. */ @Test(expectedExceptions = IllegalArgumentException.class) + public void testSetOtelCustomDimensionsMapWithInvalidValue() { + Map otelConfigs = new HashMap<>(); + otelConfigs.put(OTEL_VENICE_METRICS_ENABLED, "true"); + otelConfigs.put(OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT, "false"); + otelConfigs.put(OTEL_VENICE_METRICS_CUSTOM_DIMENSIONS_MAP, "key1=value1,key2=value2=3"); + new Builder().setServiceName("TestService") + .setMetricPrefix("TestPrefix") + .extractAndSetOtelConfigs(otelConfigs) + .build(); + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsRepositoryTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsRepositoryTest.java new file mode 100644 index 0000000000..70f3e50702 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsRepositoryTest.java @@ -0,0 +1,51 @@ +package com.linkedin.venice.stats; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; + +import io.tehuti.metrics.MetricConfig; +import org.mockito.Mockito; +import org.testng.annotations.Test; + + +/** Tests construction and close() of VeniceMetricsRepository. */ public class VeniceMetricsRepositoryTest { + /** The no-arg constructor must provide a non-null config and a non-null OpenTelemetry repository. */ @Test + public void testDefaultConstructor() throws Exception { + VeniceMetricsRepository repository = new VeniceMetricsRepository(); + assertNotNull(repository.getVeniceMetricsConfig(), "VeniceMetricsConfig should not be null."); + assertNotNull(repository.getOpenTelemetryMetricsRepository(), "OpenTelemetryMetricsRepository should not be null."); + repository.close(); + } + + /** The two-argument constructor must expose the config and the OpenTelemetry repository it was given (compared with assertEquals). */ @Test + public void testConstructorWithAllParameters() { + VeniceMetricsConfig metricsConfig = new VeniceMetricsConfig.Builder().build(); + VeniceOpenTelemetryMetricsRepository openTelemetryMetricsRepository = + new VeniceOpenTelemetryMetricsRepository(metricsConfig); + VeniceMetricsRepository repository = new 
VeniceMetricsRepository(metricsConfig, openTelemetryMetricsRepository); + + assertEquals( + repository.getVeniceMetricsConfig(), + metricsConfig, + "VeniceMetricsConfig should match the provided config."); + assertEquals( + repository.getOpenTelemetryMetricsRepository(), + openTelemetryMetricsRepository, + "OpenTelemetryMetricsRepository should match the provided instance."); + repository.close(); + } + + /** close() on the repository must call close() on the wrapped OpenTelemetry repository (both collaborators are mocks here). */ @Test + public void testCloseMethod() { + VeniceMetricsConfig mockConfig = Mockito.mock(VeniceMetricsConfig.class); + VeniceOpenTelemetryMetricsRepository mockOpenTelemetryRepository = + Mockito.mock(VeniceOpenTelemetryMetricsRepository.class); + Mockito.when(mockConfig.getTehutiMetricConfig()).thenReturn(new MetricConfig()); + + VeniceMetricsRepository repository = new VeniceMetricsRepository(mockConfig, mockOpenTelemetryRepository); + repository.close(); + + // Verify that close methods are called + Mockito.verify(mockOpenTelemetryRepository).close(); + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java new file mode 100644 index 0000000000..7d24139d10 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java @@ -0,0 +1,126 @@ +package com.linkedin.venice.stats; + +import static com.linkedin.venice.stats.VeniceOpenTelemetryMetricNamingFormat.transformMetricName; +import static com.linkedin.venice.stats.VeniceOpenTelemetryMetricNamingFormat.validateMetricName; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertSame; + +import com.linkedin.venice.stats.metrics.MetricEntity; +import com.linkedin.venice.stats.metrics.MetricType; +import 
com.linkedin.venice.stats.metrics.MetricUnit; +import io.opentelemetry.api.metrics.DoubleHistogram; +import io.opentelemetry.api.metrics.LongCounter; +import io.opentelemetry.sdk.metrics.export.MetricExporter; +import org.mockito.Mockito; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + + +/** Tests for VeniceOpenTelemetryMetricsRepository, built in each test from a mocked VeniceMetricsConfig. */ public class VeniceOpenTelemetryMetricsRepositoryTest { + private VeniceOpenTelemetryMetricsRepository metricsRepository; + + private VeniceMetricsConfig mockMetricsConfig; + + /** Creates the repository under test from a mocked config that enables OTel emission and endpoint export, uses SNAKE_CASE naming and points at a localhost endpoint. */ @BeforeMethod + public void setUp() { + mockMetricsConfig = Mockito.mock(VeniceMetricsConfig.class); + Mockito.when(mockMetricsConfig.emitOtelMetrics()).thenReturn(true); + Mockito.when(mockMetricsConfig.getMetricNamingFormat()) + .thenReturn(VeniceOpenTelemetryMetricNamingFormat.SNAKE_CASE); + Mockito.when(mockMetricsConfig.getMetricPrefix()).thenReturn("test_prefix"); + Mockito.when(mockMetricsConfig.getServiceName()).thenReturn("test_service"); + Mockito.when(mockMetricsConfig.exportOtelMetricsToEndpoint()).thenReturn(true); + Mockito.when(mockMetricsConfig.getOtelEndpoint()).thenReturn("http://localhost:4318"); + + metricsRepository = new VeniceOpenTelemetryMetricsRepository(mockMetricsConfig); + } + + /** Closes the repository created in setUp. */ @AfterMethod + public void tearDown() { + metricsRepository.close(); + } + + /** With emission enabled, the constructor must leave a non-null SdkMeterProvider and Meter. */ @Test + public void testConstructorInitialize() { + // Check if OpenTelemetry and SdkMeterProvider are initialized correctly + assertNotNull(metricsRepository.getSdkMeterProvider()); + assertNotNull(metricsRepository.getMeter()); + } + + /** With emission disabled, the provider, the meter and every created instrument must be null. NOTE(review): the local metricsRepository shadows the field and is not closed in this method; confirm that is intended. */ @Test + public void testConstructorWithEmitDisabled() { + Mockito.when(mockMetricsConfig.emitOtelMetrics()).thenReturn(false); + VeniceOpenTelemetryMetricsRepository metricsRepository = + new VeniceOpenTelemetryMetricsRepository(mockMetricsConfig); + + // Verify that metrics-related fields are null when metrics are disabled + assertNull(metricsRepository.getSdkMeterProvider()); + 
assertNull(metricsRepository.getMeter()); + assertNull( + metricsRepository.createInstrument(new MetricEntity("test", MetricType.HISTOGRAM, MetricUnit.NUMBER, "desc"))); + assertNull( + metricsRepository.createInstrument(new MetricEntity("test", MetricType.COUNTER, MetricUnit.NUMBER, "desc"))); + } + + /** getOtlpHttpMetricExporter must return a non-null exporter for the mocked config from setUp. */ @Test + public void testGetOtlpHttpMetricExporterWithValidConfig() { + MetricExporter exporter = metricsRepository.getOtlpHttpMetricExporter(mockMetricsConfig); + + // Verify that the exporter is not null + assertNotNull(exporter); + } + + /** A null metric name must be rejected with IllegalArgumentException. */ @Test(expectedExceptions = IllegalArgumentException.class) + public void testValidateMetricNameWithNullName() { + validateMetricName(null); + } + + /** An empty metric name must be rejected with IllegalArgumentException. */ @Test(expectedExceptions = IllegalArgumentException.class) + public void testValidateMetricNameWithEmptyName() { + validateMetricName(""); + } + + /** A metric name containing a space and an exclamation mark must be rejected with IllegalArgumentException. */ @Test(expectedExceptions = IllegalArgumentException.class) + public void testValidateMetricNameWithInvalidName() { + validateMetricName("Invalid Name!"); + } + + /** Checks that getFullMetricName joins prefix and name with a dot, and that a dotted snake_case name is converted as asserted for PASCAL_CASE and CAMEL_CASE. */ @Test + public void testTransformMetricName() { + Mockito.when(mockMetricsConfig.getMetricNamingFormat()) + .thenReturn(VeniceOpenTelemetryMetricNamingFormat.SNAKE_CASE); + assertEquals(metricsRepository.getFullMetricName("prefix", "metric_name"), "prefix.metric_name"); + + String transformedName = + transformMetricName("test.test_metric_name", VeniceOpenTelemetryMetricNamingFormat.PASCAL_CASE); + assertEquals(transformedName, "Test.TestMetricName"); + + transformedName = transformMetricName("test.test_metric_name", VeniceOpenTelemetryMetricNamingFormat.CAMEL_CASE); + assertEquals(transformedName, "test.testMetricName"); + } + + /** Creating a histogram instrument twice for the same metric name must return the same instance. */ @Test + public void testCreateTwoHistograms() { + DoubleHistogram histogram1 = (DoubleHistogram) metricsRepository + .createInstrument(new MetricEntity("test_histogram", MetricType.HISTOGRAM, MetricUnit.NUMBER, "desc")); + DoubleHistogram histogram2 = (DoubleHistogram) metricsRepository + .createInstrument(new MetricEntity("test_histogram", 
MetricType.HISTOGRAM, MetricUnit.NUMBER, "desc")); + + assertNotNull(histogram1); + assertSame(histogram1, histogram2, "Should return the same instance for the same histogram name."); + } + + /** Creating a counter instrument twice for the same metric name must return the same instance. */ @Test + public void testCreateTwoCounters() { + LongCounter counter1 = (LongCounter) metricsRepository + .createInstrument(new MetricEntity("test_counter", MetricType.COUNTER, MetricUnit.NUMBER, "desc")); + LongCounter counter2 = (LongCounter) metricsRepository + .createInstrument(new MetricEntity("test_counter", MetricType.COUNTER, MetricUnit.NUMBER, "desc")); + + assertNotNull(counter1); + assertSame(counter1, counter2, "Should return the same instance for the same counter name."); + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/HttpResponseStatusCodeCategoryTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/HttpResponseStatusCodeCategoryTest.java new file mode 100644 index 0000000000..9a60bcfacd --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/HttpResponseStatusCodeCategoryTest.java @@ -0,0 +1,25 @@ +package com.linkedin.venice.stats.dimensions; + +import static com.linkedin.venice.stats.dimensions.HttpResponseStatusCodeCategory.getVeniceHttpResponseStatusCodeCategory; +import static org.testng.Assert.assertEquals; + +import io.netty.handler.codec.http.HttpResponseStatus; +import org.testng.annotations.Test; + + +/** Tests the mapping from a netty HttpResponseStatus to its status code category string. */ public class HttpResponseStatusCodeCategoryTest { + /** One representative status per class must map to the categories 1xx through 5xx. */ @Test() + public void testValues() { + assertEquals(getVeniceHttpResponseStatusCodeCategory(HttpResponseStatus.PROCESSING), "1xx"); + assertEquals(getVeniceHttpResponseStatusCodeCategory(HttpResponseStatus.OK), "2xx"); + assertEquals(getVeniceHttpResponseStatusCodeCategory(HttpResponseStatus.MOVED_PERMANENTLY), "3xx"); + assertEquals(getVeniceHttpResponseStatusCodeCategory(HttpResponseStatus.BAD_REQUEST), "4xx"); + 
assertEquals(getVeniceHttpResponseStatusCodeCategory(HttpResponseStatus.INTERNAL_SERVER_ERROR), "5xx"); + } + + /** Status codes 99 and 600 must both map to the category unknown. */ @Test + public void testUnknownCategory() { + assertEquals(getVeniceHttpResponseStatusCodeCategory(HttpResponseStatus.valueOf(99)), "unknown"); + assertEquals(getVeniceHttpResponseStatusCodeCategory(HttpResponseStatus.valueOf(600)), "unknown"); + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestRetryAbortReasonTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestRetryAbortReasonTest.java new file mode 100644 index 0000000000..f7f39d9b16 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestRetryAbortReasonTest.java @@ -0,0 +1,30 @@ +package com.linkedin.venice.stats.dimensions; + +import static org.testng.Assert.assertEquals; + +import org.testng.annotations.Test; + + +/** Pins the string returned by getAbortReason() for every RequestRetryAbortReason constant. */ public class RequestRetryAbortReasonTest { + /** Iterates over all constants; the default branch throws, so a constant without an expected value here fails the test. */ @Test + public void testRetryRequestAbortReason() { + for (RequestRetryAbortReason reason: RequestRetryAbortReason.values()) { + switch (reason) { + case SLOW_ROUTE: + assertEquals(reason.getAbortReason(), "slow_route"); + break; + case DELAY_CONSTRAINT: + assertEquals(reason.getAbortReason(), "delay_constraint"); + break; + case MAX_RETRY_ROUTE_LIMIT: + assertEquals(reason.getAbortReason(), "max_retry_route_limit"); + break; + case NO_AVAILABLE_REPLICA: + assertEquals(reason.getAbortReason(), "no_available_replica"); + break; + default: + throw new IllegalArgumentException("Unknown reason: " + reason); + } + } + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestRetryTypeTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestRetryTypeTest.java new file mode 100644 index 0000000000..2141a93a88 --- /dev/null +++ 
b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestRetryTypeTest.java @@ -0,0 +1,24 @@ +package com.linkedin.venice.stats.dimensions; + +import static org.testng.Assert.assertEquals; + +import org.testng.annotations.Test; + + +/** Pins the string returned by getRetryType() for every RequestRetryType constant. */ public class RequestRetryTypeTest { + /** Iterates over all constants; the default branch throws, so a constant without an expected value here fails the test. */ @Test + public void testVeniceRequestRetryType() { + for (RequestRetryType retryType: RequestRetryType.values()) { + switch (retryType) { + case ERROR_RETRY: + assertEquals(retryType.getRetryType(), "error_retry"); + break; + case LONG_TAIL_RETRY: + assertEquals(retryType.getRetryType(), "long_tail_retry"); + break; + default: + throw new IllegalArgumentException("Unknown retry type: " + retryType); + } + } + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestValidationOutcomeTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestValidationOutcomeTest.java new file mode 100644 index 0000000000..f144850be7 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestValidationOutcomeTest.java @@ -0,0 +1,24 @@ +package com.linkedin.venice.stats.dimensions; + +import static org.testng.Assert.assertEquals; + +import org.testng.annotations.Test; + + +/** Pins the string returned by getOutcome() for every RequestValidationOutcome constant. */ public class RequestValidationOutcomeTest { + /** Iterates over all constants; the default branch throws, so a constant without an expected value here fails the test. */ @Test + public void testVeniceRequestValidationOutcome() { + for (RequestValidationOutcome outcome: RequestValidationOutcome.values()) { + switch (outcome) { + case VALID: + assertEquals(outcome.getOutcome(), "valid"); + break; + case INVALID_KEY_COUNT_LIMIT_EXCEEDED: + assertEquals(outcome.getOutcome(), "invalid_key_count_limit_exceeded"); + break; + default: + throw new IllegalArgumentException("Unknown outcome: " + outcome); + } + } + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java 
b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java new file mode 100644 index 0000000000..b7442d60b6 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java @@ -0,0 +1,123 @@ +package com.linkedin.venice.stats.dimensions; + +import static org.testng.Assert.assertEquals; + +import com.linkedin.venice.stats.VeniceOpenTelemetryMetricNamingFormat; +import org.testng.annotations.Test; + + +public class VeniceMetricsDimensionsTest { + @Test + public void testGetDimensionNameInSnakeCase() { + VeniceOpenTelemetryMetricNamingFormat format = VeniceOpenTelemetryMetricNamingFormat.SNAKE_CASE; + for (VeniceMetricsDimensions dimension: VeniceMetricsDimensions.values()) { + switch (dimension) { + case VENICE_STORE_NAME: + assertEquals(dimension.getDimensionName(format), "venice.store.name"); + break; + case VENICE_CLUSTER_NAME: + assertEquals(dimension.getDimensionName(format), "venice.cluster.name"); + break; + case VENICE_REQUEST_METHOD: + assertEquals(dimension.getDimensionName(format), "venice.request.method"); + break; + case HTTP_RESPONSE_STATUS_CODE: + assertEquals(dimension.getDimensionName(format), "http.response.status_code"); + break; + case HTTP_RESPONSE_STATUS_CODE_CATEGORY: + assertEquals(dimension.getDimensionName(format), "http.response.status_code_category"); + break; + case VENICE_REQUEST_VALIDATION_OUTCOME: + assertEquals(dimension.getDimensionName(format), "venice.request.validation_outcome"); + break; + case VENICE_RESPONSE_STATUS_CODE_CATEGORY: + assertEquals(dimension.getDimensionName(format), "venice.response.status_code_category"); + break; + case VENICE_REQUEST_RETRY_TYPE: + assertEquals(dimension.getDimensionName(format), "venice.request.retry_type"); + break; + case VENICE_REQUEST_RETRY_ABORT_REASON: + assertEquals(dimension.getDimensionName(format), "venice.request.retry_abort_reason"); + break; + default: + 
throw new IllegalArgumentException("Unknown dimension: " + dimension); + } + } + } + + @Test + public void testGetDimensionNameInCamelCase() { + VeniceOpenTelemetryMetricNamingFormat format = VeniceOpenTelemetryMetricNamingFormat.CAMEL_CASE; + for (VeniceMetricsDimensions dimension: VeniceMetricsDimensions.values()) { + switch (dimension) { + case VENICE_STORE_NAME: + assertEquals(dimension.getDimensionName(format), "venice.store.name"); + break; + case VENICE_CLUSTER_NAME: + assertEquals(dimension.getDimensionName(format), "venice.cluster.name"); + break; + case VENICE_REQUEST_METHOD: + assertEquals(dimension.getDimensionName(format), "venice.request.method"); + break; + case HTTP_RESPONSE_STATUS_CODE: + assertEquals(dimension.getDimensionName(format), "http.response.statusCode"); + break; + case HTTP_RESPONSE_STATUS_CODE_CATEGORY: + assertEquals(dimension.getDimensionName(format), "http.response.statusCodeCategory"); + break; + case VENICE_REQUEST_VALIDATION_OUTCOME: + assertEquals(dimension.getDimensionName(format), "venice.request.validationOutcome"); + break; + case VENICE_RESPONSE_STATUS_CODE_CATEGORY: + assertEquals(dimension.getDimensionName(format), "venice.response.statusCodeCategory"); + break; + case VENICE_REQUEST_RETRY_TYPE: + assertEquals(dimension.getDimensionName(format), "venice.request.retryType"); + break; + case VENICE_REQUEST_RETRY_ABORT_REASON: + assertEquals(dimension.getDimensionName(format), "venice.request.retryAbortReason"); + break; + default: + throw new IllegalArgumentException("Unknown dimension: " + dimension); + } + } + } + + @Test + public void testGetDimensionNameInPascalCase() { + VeniceOpenTelemetryMetricNamingFormat format = VeniceOpenTelemetryMetricNamingFormat.PASCAL_CASE; + for (VeniceMetricsDimensions dimension: VeniceMetricsDimensions.values()) { + switch (dimension) { + case VENICE_STORE_NAME: + assertEquals(dimension.getDimensionName(format), "Venice.Store.Name"); + break; + case VENICE_CLUSTER_NAME: + 
assertEquals(dimension.getDimensionName(format), "Venice.Cluster.Name"); + break; + case VENICE_REQUEST_METHOD: + assertEquals(dimension.getDimensionName(format), "Venice.Request.Method"); + break; + case HTTP_RESPONSE_STATUS_CODE: + assertEquals(dimension.getDimensionName(format), "Http.Response.StatusCode"); + break; + case HTTP_RESPONSE_STATUS_CODE_CATEGORY: + assertEquals(dimension.getDimensionName(format), "Http.Response.StatusCodeCategory"); + break; + case VENICE_REQUEST_VALIDATION_OUTCOME: + assertEquals(dimension.getDimensionName(format), "Venice.Request.ValidationOutcome"); + break; + case VENICE_RESPONSE_STATUS_CODE_CATEGORY: + assertEquals(dimension.getDimensionName(format), "Venice.Response.StatusCodeCategory"); + break; + case VENICE_REQUEST_RETRY_TYPE: + assertEquals(dimension.getDimensionName(format), "Venice.Request.RetryType"); + break; + case VENICE_REQUEST_RETRY_ABORT_REASON: + assertEquals(dimension.getDimensionName(format), "Venice.Request.RetryAbortReason"); + break; + default: + throw new IllegalArgumentException("Unknown dimension: " + dimension); + } + } + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategoryTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategoryTest.java new file mode 100644 index 0000000000..22272d3576 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategoryTest.java @@ -0,0 +1,33 @@ +package com.linkedin.venice.stats.dimensions; + +import static org.testng.Assert.assertEquals; + +import org.testng.annotations.Test; + + +public class VeniceResponseStatusCategoryTest { + @Test + public void testVeniceResponseStatusCategory() { + for (VeniceResponseStatusCategory responseStatusCategory: VeniceResponseStatusCategory.values()) { + switch (responseStatusCategory) { + case HEALTHY: + 
assertEquals(responseStatusCategory.getCategory(), "healthy"); + break; + case UNHEALTHY: + assertEquals(responseStatusCategory.getCategory(), "unhealthy"); + break; + case TARDY: + assertEquals(responseStatusCategory.getCategory(), "tardy"); + break; + case THROTTLED: + assertEquals(responseStatusCategory.getCategory(), "throttled"); + break; + case BAD_REQUEST: + assertEquals(responseStatusCategory.getCategory(), "bad_request"); + break; + default: + throw new IllegalArgumentException("Unknown response status category: " + responseStatusCategory); + } + } + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java new file mode 100644 index 0000000000..3e2248df96 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java @@ -0,0 +1,136 @@ +package com.linkedin.venice.stats.metrics; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.metrics.DoubleHistogram; +import io.opentelemetry.api.metrics.LongCounter; +import io.tehuti.metrics.MeasurableStat; +import io.tehuti.metrics.Sensor; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + + +public class MetricEntityStateTest { + private VeniceOpenTelemetryMetricsRepository mockOtelRepository; + private MetricEntity mockMetricEntity; + private MetricEntityState.TehutiSensorRegistrationFunction sensorRegistrationFunction; + private Sensor mockSensor; + + private enum TestTehutiMetricNameEnum 
implements TehutiMetricNameEnum { + TEST_METRIC; + + private final String metricName; + + TestTehutiMetricNameEnum() { + this.metricName = this.name().toLowerCase(); + } + + @Override + public String getMetricName() { + return this.metricName; + } + } + + @BeforeMethod + public void setUp() { + mockOtelRepository = mock(VeniceOpenTelemetryMetricsRepository.class); + mockMetricEntity = mock(MetricEntity.class); + sensorRegistrationFunction = (name, stats) -> mock(Sensor.class); + mockSensor = mock(Sensor.class); + } + + @Test + public void testCreateMetricWithOtelEnabled() { + when(mockMetricEntity.getMetricType()).thenReturn(MetricType.COUNTER); + LongCounter longCounter = mock(LongCounter.class); + when(mockOtelRepository.createInstrument(mockMetricEntity)).thenReturn(longCounter); + + Map<TehutiMetricNameEnum, List<MeasurableStat>> tehutiMetricInput = new HashMap<>(); + MetricEntityState metricEntityState = + new MetricEntityState(mockMetricEntity, mockOtelRepository, sensorRegistrationFunction, tehutiMetricInput); + + Assert.assertNotNull(metricEntityState); + Assert.assertNull(metricEntityState.getTehutiSensors()); // No Tehuti sensors added + } + + @Test + public void testAddTehutiSensorsSuccessfully() { + MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); + metricEntityState.addTehutiSensors(TestTehutiMetricNameEnum.TEST_METRIC, mockSensor); + + Assert.assertNotNull(metricEntityState.getTehutiSensors()); + Assert.assertTrue(metricEntityState.getTehutiSensors().containsKey(TestTehutiMetricNameEnum.TEST_METRIC)); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*Sensor with name 'TEST_METRIC' already exists.*") + public void testAddTehutiSensorThrowsExceptionOnDuplicate() { + MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); + metricEntityState.addTehutiSensors(TestTehutiMetricNameEnum.TEST_METRIC, mockSensor); + + // Adding the same sensor name again
should throw an exception + metricEntityState.addTehutiSensors(TestTehutiMetricNameEnum.TEST_METRIC, mockSensor); + } + + @Test + public void testRecordOtelMetricHistogram() { + DoubleHistogram doubleHistogram = mock(DoubleHistogram.class); + when(mockMetricEntity.getMetricType()).thenReturn(MetricType.HISTOGRAM); + + MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); + metricEntityState.setOtelMetric(doubleHistogram); + + Attributes attributes = Attributes.builder().put("key", "value").build(); + metricEntityState.recordOtelMetric(5.5, attributes); + + verify(doubleHistogram, times(1)).record(5.5, attributes); + } + + @Test + public void testRecordOtelMetricCounter() { + LongCounter longCounter = mock(LongCounter.class); + when(mockMetricEntity.getMetricType()).thenReturn(MetricType.COUNTER); + + MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); + metricEntityState.setOtelMetric(longCounter); + + Attributes attributes = Attributes.builder().put("key", "value").build(); + metricEntityState.recordOtelMetric(10, attributes); + + verify(longCounter, times(1)).add(10, attributes); + } + + @Test + public void testRecordTehutiMetric() { + MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); + metricEntityState.addTehutiSensors(TestTehutiMetricNameEnum.TEST_METRIC, mockSensor); + + metricEntityState.recordTehutiMetric(TestTehutiMetricNameEnum.TEST_METRIC, 15.0); + + verify(mockSensor, times(1)).record(15.0); + } + + @Test + public void testRecordMetricsWithBothOtelAndTehuti() { + DoubleHistogram doubleHistogram = mock(DoubleHistogram.class); + when(mockMetricEntity.getMetricType()).thenReturn(MetricType.HISTOGRAM); + + MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); + metricEntityState.setOtelMetric(doubleHistogram); + 
metricEntityState.addTehutiSensors(TestTehutiMetricNameEnum.TEST_METRIC, mockSensor); + + Attributes attributes = Attributes.builder().put("key", "value").build(); + metricEntityState.record(TestTehutiMetricNameEnum.TEST_METRIC, 20.0, attributes); + + verify(doubleHistogram, times(1)).record(20.0, attributes); + verify(mockSensor, times(1)).record(20.0); + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityTest.java new file mode 100644 index 0000000000..f1fe09ff8d --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityTest.java @@ -0,0 +1,55 @@ +package com.linkedin.venice.stats.metrics; + +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; + +import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; +import java.util.HashSet; +import java.util.Set; +import org.testng.Assert; +import org.testng.annotations.Test; + + +public class MetricEntityTest { + @Test + public void testMetricEntityConstructorWithoutDimensions() { + String metricName = "testMetric"; + MetricType metricType = MetricType.COUNTER; + MetricUnit unit = MetricUnit.MILLISECOND; + String description = "Test description"; + + MetricEntity metricEntity = new MetricEntity(metricName, metricType, unit, description); + + Assert.assertEquals(metricEntity.getMetricName(), metricName, "Metric name should match"); + Assert.assertEquals(metricEntity.getMetricType(), metricType, "Metric type should match"); + Assert.assertEquals(metricEntity.getUnit(), unit, "Metric unit should match"); + Assert.assertEquals(metricEntity.getDescription(), description, "Description should match"); + Assert.assertNull(metricEntity.getDimensionsList(), "Dimensions list 
should be null"); + } + + @Test + public void testMetricEntityConstructorWithDimensions() { + String metricName = "testMetric"; + MetricType metricType = MetricType.COUNTER; + MetricUnit unit = MetricUnit.MILLISECOND; + String description = "Test description with dimensions"; + + Set<VeniceMetricsDimensions> dimensions = new HashSet<>(); + dimensions.add(VENICE_STORE_NAME); + dimensions.add(VENICE_CLUSTER_NAME); + + MetricEntity metricEntity = new MetricEntity(metricName, metricType, unit, description, dimensions); + + Assert.assertEquals(metricEntity.getMetricName(), metricName, "Metric name should match"); + Assert.assertEquals(metricEntity.getMetricType(), metricType, "Metric type should match"); + Assert.assertEquals(metricEntity.getUnit(), unit, "Metric unit should match"); + Assert.assertEquals(metricEntity.getDescription(), description, "Description should match"); + Assert.assertNotNull(metricEntity.getDimensionsList(), "Dimensions list should not be null"); + Assert.assertEquals(metricEntity.getDimensionsList(), dimensions, "Dimensions list should match"); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMetricEntityConstructorWithEmptyName() { + new MetricEntity("", MetricType.COUNTER, MetricUnit.MILLISECOND, "Empty name test"); + } +} diff --git a/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/AggPushHealthStats.java b/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/AggPushHealthStats.java index ebf508d83f..7d7035f93c 100644 --- a/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/AggPushHealthStats.java +++ b/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/AggPushHealthStats.java @@ -16,7 +16,8 @@ public AggPushHealthStats( metricsRepository, PushHealthStats::new, metadataRepository, - isUnregisterMetricForDeletedStoreEnabled); + isUnregisterMetricForDeletedStoreEnabled, + true); } public void recordFailedPush(String storeName, long durationInSec) { diff --git
a/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/AggPushStatusCleanUpStats.java b/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/AggPushStatusCleanUpStats.java index 6613c1c16a..c7e508ee74 100644 --- a/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/AggPushStatusCleanUpStats.java +++ b/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/AggPushStatusCleanUpStats.java @@ -16,7 +16,8 @@ public AggPushStatusCleanUpStats( metricsRepository, PushStatusCleanUpStats::new, metadataRepository, - isUnregisterMetricForDeletedStoreEnabled); + isUnregisterMetricForDeletedStoreEnabled, + true); } public void recordLeakedPushStatusCount(int count) { diff --git a/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/PushHealthStats.java b/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/PushHealthStats.java index a552678d23..c6cf6f405d 100644 --- a/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/PushHealthStats.java +++ b/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/PushHealthStats.java @@ -15,7 +15,7 @@ public class PushHealthStats extends AbstractVeniceStats { private final Sensor successfulPushDurationSensorGauge; - public PushHealthStats(MetricsRepository metricsRepository, String storeName) { + public PushHealthStats(MetricsRepository metricsRepository, String storeName, String clusterName) { super(metricsRepository, storeName); failedPushDurationSensor = registerSensorIfAbsent("failed_push_duration_sec", new Avg(), new Max()); successfulPushDurationSensor = registerSensorIfAbsent("successful_push_duration_sec", new Avg(), new Max()); diff --git a/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/PushStatusCleanUpStats.java b/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/PushStatusCleanUpStats.java index 35b4819775..698ba21afa 100644 --- 
a/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/PushStatusCleanUpStats.java +++ b/internal/venice-common/src/main/java/com/linkedin/venice/pushmonitor/PushStatusCleanUpStats.java @@ -13,7 +13,7 @@ public class PushStatusCleanUpStats extends AbstractVeniceStats { private final Sensor leakedPushStatusCleanUpServiceStateSensor; private final Sensor leakedPushStatusCountSensor; - public PushStatusCleanUpStats(MetricsRepository metricsRepository, String storeName) { + public PushStatusCleanUpStats(MetricsRepository metricsRepository, String storeName, String clusterName) { super(metricsRepository, storeName); leakedPushStatusCountSensor = registerSensorIfAbsent("leaked_push_status_count", new Gauge()); failedLeakedPushStatusCleanUpCountSensor = diff --git a/internal/venice-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStats.java b/internal/venice-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStats.java index 07483b6099..ed23d6e9fd 100644 --- a/internal/venice-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStats.java +++ b/internal/venice-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStats.java @@ -20,27 +20,19 @@ public AbstractVeniceAggStoreStats( MetricsRepository metricsRepository, StatsSupplier statsSupplier, ReadOnlyStoreRepository metadataRepository, - boolean isUnregisterMetricForDeletedStoreEnabled) { - super(clusterName, metricsRepository, statsSupplier); - this.isUnregisterMetricForDeletedStoreEnabled = isUnregisterMetricForDeletedStoreEnabled; - registerStoreDataChangedListenerIfRequired(metadataRepository); - } - - public AbstractVeniceAggStoreStats( - MetricsRepository metricsRepository, - StatsSupplier statsSupplier, - ReadOnlyStoreRepository metadataRepository, - boolean isUnregisterMetricForDeletedStoreEnabled) { - super(metricsRepository, statsSupplier); + boolean isUnregisterMetricForDeletedStoreEnabled, + boolean perClusterAggregate) { + 
super(clusterName, metricsRepository, statsSupplier, perClusterAggregate); this.isUnregisterMetricForDeletedStoreEnabled = isUnregisterMetricForDeletedStoreEnabled; registerStoreDataChangedListenerIfRequired(metadataRepository); } public AbstractVeniceAggStoreStats( + String clusterName, MetricsRepository metricsRepository, ReadOnlyStoreRepository metadataRepository, boolean isUnregisterMetricForDeletedStoreEnabled) { - super(metricsRepository); + super(clusterName, metricsRepository); this.isUnregisterMetricForDeletedStoreEnabled = isUnregisterMetricForDeletedStoreEnabled; registerStoreDataChangedListenerIfRequired(metadataRepository); } diff --git a/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/integration/utils/VeniceRouterWrapper.java b/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/integration/utils/VeniceRouterWrapper.java index 23260f6058..002ac5dc24 100644 --- a/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/integration/utils/VeniceRouterWrapper.java +++ b/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/integration/utils/VeniceRouterWrapper.java @@ -22,6 +22,16 @@ import static com.linkedin.venice.ConfigKeys.ZOOKEEPER_ADDRESS; import static com.linkedin.venice.VeniceConstants.DEFAULT_PER_ROUTER_READ_QUOTA; import static com.linkedin.venice.integration.utils.VeniceClusterWrapperConstants.ROUTER_PORT_TO_USE_IN_VENICE_ROUTER_WRAPPER; +import static com.linkedin.venice.router.RouterServer.ROUTER_SERVICE_METRIC_ENTITIES; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_BUCKETS; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_SCALE; +import static 
com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_PROTOCOL; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_VENICE_METRICS_ENABLED; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT; +import static com.linkedin.venice.stats.VeniceMetricsConfig.OTEL_VENICE_METRICS_EXPORT_TO_LOG; import com.linkedin.venice.client.store.ClientConfig; import com.linkedin.venice.helix.HelixBaseRoutingRepository; @@ -31,7 +41,7 @@ import com.linkedin.venice.router.RouterServer; import com.linkedin.venice.router.httpclient.StorageNodeClientType; import com.linkedin.venice.servicediscovery.ServiceDiscoveryAnnouncer; -import com.linkedin.venice.stats.TehutiUtils; +import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.tehuti.MetricsAware; import com.linkedin.venice.utils.PropertyBuilder; import com.linkedin.venice.utils.SslUtils; @@ -60,6 +70,7 @@ public class VeniceRouterWrapper extends ProcessWrapper implements MetricsAware public static final String CLUSTER_DISCOVERY_D2_SERVICE_NAME = ClientConfig.DEFAULT_CLUSTER_DISCOVERY_D2_SERVICE_NAME + "_test"; private static final String ROUTER_SERVICE_NAME = "venice-router"; + private static final String ROUTER_SERVICE_METRIC_PREFIX = "router"; private final VeniceProperties properties; private final String zkAddress; private RouterServer service; @@ -152,6 +163,16 @@ static StatefulServiceProvider generateService( .put(MAX_READ_CAPACITY, DEFAULT_PER_ROUTER_READ_QUOTA) .put(SYSTEM_SCHEMA_CLUSTER_NAME, clusterName) .put(ROUTER_STORAGE_NODE_CLIENT_TYPE, StorageNodeClientType.APACHE_HTTP_ASYNC_CLIENT.name()) + // OpenTelemetry configs + .put(OTEL_VENICE_METRICS_ENABLED, Boolean.TRUE.toString()) + 
.put(OTEL_VENICE_METRICS_EXPORT_TO_LOG, Boolean.TRUE.toString()) + .put(OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT, Boolean.TRUE.toString()) + .put(OTEL_EXPORTER_OTLP_METRICS_PROTOCOL, "http/protobuf") + .put(OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, "http://localhost:4318/v1/metrics") + .put(OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE, "delta") + .put(OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION, "base2_exponential_bucket_histogram") + .put(OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_SCALE, 3) + .put(OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION_MAX_BUCKETS, 250) .put(properties); // setup d2 config first @@ -175,7 +196,11 @@ static StatefulServiceProvider generateService( d2Servers, Optional.empty(), Optional.of(SslUtils.getVeniceLocalSslFactory()), - TehutiUtils.getMetricsRepository(ROUTER_SERVICE_NAME), + VeniceMetricsRepository.getVeniceMetricsRepository( + ROUTER_SERVICE_NAME, + ROUTER_SERVICE_METRIC_PREFIX, + ROUTER_SERVICE_METRIC_ENTITIES, + routerProperties.getAsMap()), D2TestUtils.getAndStartD2Client(zkAddress), CLUSTER_DISCOVERY_D2_SERVICE_NAME); return new VeniceRouterWrapper( @@ -237,7 +262,11 @@ protected void newProcess() { d2Servers, Optional.empty(), Optional.of(SslUtils.getVeniceLocalSslFactory()), - TehutiUtils.getMetricsRepository(ROUTER_SERVICE_NAME), + VeniceMetricsRepository.getVeniceMetricsRepository( + ROUTER_SERVICE_NAME, + ROUTER_SERVICE_METRIC_PREFIX, + ROUTER_SERVICE_METRIC_ENTITIES, + properties.getAsMap()), D2TestUtils.getAndStartD2Client(zkAddress), CLUSTER_DISCOVERY_D2_SERVICE_NAME); LOGGER.info("Started VeniceRouterWrapper: {}", this); diff --git a/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/router/api/TestVeniceDispatcher.java b/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/router/api/TestVeniceDispatcher.java index a3a73dc59f..1349a56226 100644 --- 
a/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/router/api/TestVeniceDispatcher.java +++ b/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/router/api/TestVeniceDispatcher.java @@ -37,6 +37,7 @@ import com.linkedin.venice.router.stats.RouteHttpRequestStats; import com.linkedin.venice.router.stats.RouterStats; import com.linkedin.venice.schema.avro.ReadAvroProtocolDefinition; +import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.utils.TestUtils; import io.netty.handler.codec.http.DefaultFullHttpResponse; import io.netty.handler.codec.http.DefaultHttpHeaders; @@ -46,7 +47,6 @@ import io.netty.handler.codec.http.HttpMethod; import io.netty.handler.codec.http.HttpResponseStatus; import io.netty.handler.codec.http.HttpVersion; -import io.tehuti.metrics.MetricsRepository; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -322,7 +322,7 @@ private VeniceDispatcher getMockDispatcher(boolean forcePendingCheck, boolean fo doReturn(TimeUnit.MINUTES.toMillis(1)).when(routerConfig).getLeakedFutureCleanupThresholdMs(); doReturn(24).when(routerConfig).getIoThreadCountInPoolMode(); ReadOnlyStoreRepository mockStoreRepo = mock(ReadOnlyStoreRepository.class); - MetricsRepository mockMetricsRepo = new MetricsRepository(); + VeniceMetricsRepository mockMetricsRepo = new VeniceMetricsRepository(); RouterStats mockRouterStats = mock(RouterStats.class); RouteHttpRequestStats routeHttpRequestStats = mock(RouteHttpRequestStats.class); when(mockRouterStats.getStatsByType(any())).thenReturn(mock(AggRouterHttpRequestStats.class)); diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AggPartitionHealthStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AggPartitionHealthStats.java index 56eaa9e3ee..18b8791b46 100644 --- 
a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AggPartitionHealthStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AggPartitionHealthStats.java @@ -37,7 +37,7 @@ protected AggPartitionHealthStats( String clusterName, ReadOnlyStoreRepository storeRepository, PushMonitor pushMonitor) { - super(clusterName, null, (metricRepo, resourceName) -> new PartitionHealthStats(resourceName)); + super(clusterName, null, (metricRepo, resourceName, cluster) -> new PartitionHealthStats(resourceName), true); this.storeRepository = storeRepository; this.pushMonitor = pushMonitor; } @@ -48,7 +48,7 @@ public AggPartitionHealthStats( RoutingDataRepository routingDataRepository, ReadOnlyStoreRepository storeRepository, PushMonitor pushMonitor) { - super(clusterName, metricsRepository, PartitionHealthStats::new); + super(clusterName, metricsRepository, PartitionHealthStats::new, true); this.storeRepository = storeRepository; this.pushMonitor = pushMonitor; diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PartitionHealthStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PartitionHealthStats.java index 7d269238bc..4938ec8bea 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PartitionHealthStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PartitionHealthStats.java @@ -22,7 +22,7 @@ public PartitionHealthStats(String resourceName) { super(null, resourceName); } - public PartitionHealthStats(MetricsRepository metricsRepository, String name) { + public PartitionHealthStats(MetricsRepository metricsRepository, String name, String clusterName) { super(metricsRepository, name); synchronized (PartitionHealthStats.class) { Sensor existingMetric = metricsRepository.getSensor(getSensorFullName(UNDER_REPLICATED_PARTITION_SENSOR)); diff --git 
a/services/venice-router/build.gradle b/services/venice-router/build.gradle index 1eda645461..fac67a86eb 100644 --- a/services/venice-router/build.gradle +++ b/services/venice-router/build.gradle @@ -64,6 +64,7 @@ dependencies { implementation libraries.httpAsyncClient implementation project(':internal:alpini:router:alpini-router-api') implementation project(':internal:alpini:router:alpini-router-impl') + implementation libraries.opentelemetryApi testImplementation project(':clients:venice-thin-client') testImplementation libraries.kafkaClientsTest // TODO: Get rid of Kafka dependency in venice-common (used by TopicCreator) diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/RouterServer.java b/services/venice-router/src/main/java/com/linkedin/venice/router/RouterServer.java index c70f2bedd7..e2db771326 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/RouterServer.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/RouterServer.java @@ -68,6 +68,7 @@ import com.linkedin.venice.router.stats.LongTailRetryStatsProvider; import com.linkedin.venice.router.stats.RouteHttpRequestStats; import com.linkedin.venice.router.stats.RouterHttpRequestStats; +import com.linkedin.venice.router.stats.RouterMetricEntity; import com.linkedin.venice.router.stats.RouterStats; import com.linkedin.venice.router.stats.RouterThrottleStats; import com.linkedin.venice.router.stats.SecurityStats; @@ -79,10 +80,11 @@ import com.linkedin.venice.security.SSLFactory; import com.linkedin.venice.service.AbstractVeniceService; import com.linkedin.venice.servicediscovery.ServiceDiscoveryAnnouncer; -import com.linkedin.venice.stats.TehutiUtils; import com.linkedin.venice.stats.ThreadPoolStats; import com.linkedin.venice.stats.VeniceJVMStats; +import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.stats.ZkClientStatusStats; +import com.linkedin.venice.stats.metrics.MetricEntity; import 
com.linkedin.venice.throttle.EventThrottler; import com.linkedin.venice.utils.DaemonThreadFactory; import com.linkedin.venice.utils.HelixUtils; @@ -107,6 +109,8 @@ import java.net.SocketAddress; import java.time.Duration; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; @@ -121,6 +125,7 @@ import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import java.util.function.LongSupplier; +import java.util.stream.Collectors; import javax.annotation.Nonnull; import org.apache.helix.InstanceType; import org.apache.helix.manager.zk.ZKHelixManager; @@ -193,8 +198,10 @@ public class RouterServer extends AbstractVeniceService { // A map of optional ChannelHandlers that retains insertion order to be added at the end of the router pipeline private final Map optionalChannelHandlers = new LinkedHashMap<>(); - private static final String ROUTER_SERVICE_NAME = "venice-router"; - + public static final String ROUTER_SERVICE_NAME = "venice-router"; + public static final String ROUTER_SERVICE_METRIC_PREFIX = "router"; + public static final Collection<MetricEntity> ROUTER_SERVICE_METRIC_ENTITIES = Collections.unmodifiableList( + Arrays.stream(RouterMetricEntity.values()).map(RouterMetricEntity::getMetricEntity).collect(Collectors.toList())); /** * Thread number used to monitor the listening port; */ @@ -272,7 +279,11 @@ public RouterServer( serviceDiscoveryAnnouncers, accessController, sslFactory, - TehutiUtils.getMetricsRepository(ROUTER_SERVICE_NAME), + VeniceMetricsRepository.getVeniceMetricsRepository( + ROUTER_SERVICE_NAME, + ROUTER_SERVICE_METRIC_PREFIX, + ROUTER_SERVICE_METRIC_ENTITIES, + properties.getAsMap()), null, "venice-discovery"); } @@ -322,6 +333,7 @@ public RouterServer( config.getClusterName()); this.routerStats = new RouterStats<>( requestType -> new AggRouterHttpRequestStats( + config.getClusterName(), metricsRepository, requestType,
config.isKeyValueProfilingEnabled(), @@ -381,7 +393,7 @@ private RouterServer( this.metaStoreShadowReader = Optional.empty(); this.metricsRepository = metricsRepository; - this.aggHostHealthStats = new AggHostHealthStats(metricsRepository); + this.aggHostHealthStats = new AggHostHealthStats(config.getClusterName(), metricsRepository); this.serviceDiscoveryAnnouncers = serviceDiscoveryAnnouncers; this.accessController = accessController; @@ -409,12 +421,23 @@ public RouterServer( List serviceDiscoveryAnnouncers, Optional sslFactory, HelixLiveInstanceMonitor liveInstanceMonitor) { - this(properties, serviceDiscoveryAnnouncers, Optional.empty(), sslFactory, new MetricsRepository(), false); + this( + properties, + serviceDiscoveryAnnouncers, + Optional.empty(), + sslFactory, + VeniceMetricsRepository.getVeniceMetricsRepository( + ROUTER_SERVICE_NAME, + ROUTER_SERVICE_METRIC_PREFIX, + ROUTER_SERVICE_METRIC_ENTITIES, + properties.getAsMap()), + false); this.routingDataRepository = routingDataRepository; this.hybridStoreQuotaRepository = hybridStoreQuotaRepository; this.metadataRepository = metadataRepository; this.routerStats = new RouterStats<>( requestType -> new AggRouterHttpRequestStats( + config.getClusterName(), metricsRepository, requestType, config.isKeyValueProfilingEnabled(), diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/api/RouterExceptionAndTrackingUtils.java b/services/venice-router/src/main/java/com/linkedin/venice/router/api/RouterExceptionAndTrackingUtils.java index 58cf615980..5c79e02630 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/api/RouterExceptionAndTrackingUtils.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/api/RouterExceptionAndTrackingUtils.java @@ -154,7 +154,7 @@ private static void metricTracking( // If we don't know the actual store name, this error will only be aggregated in server level, but not // in store level if (responseStatus.equals(BAD_REQUEST) 
|| responseStatus.equals(REQUEST_ENTITY_TOO_LARGE)) { - stats.recordBadRequest(storeName.orElse(null)); + stats.recordBadRequest(storeName.orElse(null), responseStatus); } else if (responseStatus.equals(TOO_MANY_REQUESTS)) { if (storeName.isPresent()) { if (requestType.isPresent()) { @@ -165,7 +165,7 @@ private static void metricTracking( * * TODO: Remove this metric after the above work is done... */ - stats.recordThrottledRequest(storeName.get()); + stats.recordThrottledRequest(storeName.get(), responseStatus); } } else { // not possible to have empty store name in this scenario @@ -198,7 +198,7 @@ private static void metricTracking( return; } - stats.recordUnhealthyRequest(storeName.orElse(null)); + stats.recordUnhealthyRequest(storeName.orElse(null), responseStatus); if (responseStatus.equals(SERVICE_UNAVAILABLE)) { if (storeName.isPresent()) { diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceResponseAggregator.java b/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceResponseAggregator.java index a883dc0970..338d4b63b4 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceResponseAggregator.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceResponseAggregator.java @@ -230,7 +230,7 @@ public FullHttpResponse buildResponse( } } - HttpResponseStatus responseStatus = finalResponse.status(); + HttpResponseStatus httpResponseStatus = finalResponse.status(); Map allMetrics = metrics.getMetrics(); /** * All the metrics in {@link com.linkedin.ddsstorage.router.api.MetricNames} are supported in {@link Metrics}. @@ -244,20 +244,20 @@ public FullHttpResponse buildResponse( // here... 
double latency = LatencyUtils.convertNSToMS(timeValue.getRawValue(TimeUnit.NANOSECONDS)); stats.recordLatency(storeName, latency); - if (HEALTHY_STATUSES.contains(responseStatus)) { + if (HEALTHY_STATUSES.contains(httpResponseStatus)) { routerStats.getStatsByType(RequestType.SINGLE_GET) .recordReadQuotaUsage(storeName, venicePath.getPartitionKeys().size()); if (isFastRequest(latency, requestType)) { - stats.recordHealthyRequest(storeName, latency); + stats.recordHealthyRequest(storeName, latency, httpResponseStatus); } else { - stats.recordTardyRequest(storeName, latency); + stats.recordTardyRequest(storeName, latency, httpResponseStatus); } - } else if (responseStatus.equals(TOO_MANY_REQUESTS)) { + } else if (httpResponseStatus.equals(TOO_MANY_REQUESTS)) { LOGGER.debug("request is rejected by storage node because quota is exceeded"); - stats.recordThrottledRequest(storeName, latency); + stats.recordThrottledRequest(storeName, latency, httpResponseStatus); } else { - LOGGER.debug("Unhealthy request detected, latency: {}ms, response status: {}", latency, responseStatus); - stats.recordUnhealthyRequest(storeName, latency); + LOGGER.debug("Unhealthy request detected, latency: {}ms, response status: {}", latency, httpResponseStatus); + stats.recordUnhealthyRequest(storeName, latency, httpResponseStatus); } } timeValue = allMetrics.get(ROUTER_RESPONSE_WAIT_TIME); @@ -275,7 +275,7 @@ public FullHttpResponse buildResponse( double routingTime = LatencyUtils.convertNSToMS(timeValue.getRawValue(TimeUnit.NANOSECONDS)); stats.recordRequestRoutingLatency(storeName, routingTime); } - if (HEALTHY_STATUSES.contains(responseStatus) && !venicePath.isStreamingRequest()) { + if (HEALTHY_STATUSES.contains(httpResponseStatus) && !venicePath.isStreamingRequest()) { // Only record successful response stats.recordResponseSize(storeName, finalResponse.content().readableBytes()); } diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggHostHealthStats.java 
b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggHostHealthStats.java index cc72755409..0343f2d0d1 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggHostHealthStats.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggHostHealthStats.java @@ -2,18 +2,16 @@ import com.linkedin.venice.stats.AbstractVeniceAggStats; import com.linkedin.venice.stats.StatsUtils; -import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; -import java.util.Map; public class AggHostHealthStats extends AbstractVeniceAggStats { - private final Map hostHealthStatsMap = new VeniceConcurrentHashMap<>(); - - public AggHostHealthStats(MetricsRepository metricsRepository) { + public AggHostHealthStats(String clusterName, MetricsRepository metricsRepository) { super( + clusterName, metricsRepository, - (repo, hostName) -> new HostHealthStats(repo, StatsUtils.convertHostnameToMetricName(hostName))); + (repo, hostName, cluster) -> new HostHealthStats(repo, StatsUtils.convertHostnameToMetricName(hostName)), + false); } private HostHealthStats getHostStats(String hostName) { diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStats.java b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStats.java index 517f1485e7..7f59b12b2b 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStats.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStats.java @@ -6,6 +6,7 @@ import com.linkedin.venice.stats.AbstractVeniceAggStats; import com.linkedin.venice.stats.AbstractVeniceAggStoreStats; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; +import io.netty.handler.codec.http.HttpResponseStatus; import io.tehuti.metrics.MetricsRepository; import java.util.Map; 
import java.util.function.Function; @@ -15,25 +16,33 @@ public class AggRouterHttpRequestStats extends AbstractVeniceAggStoreStats scatterGatherStatsMap = new VeniceConcurrentHashMap<>(); public AggRouterHttpRequestStats( + String clusterName, MetricsRepository metricsRepository, RequestType requestType, ReadOnlyStoreRepository metadataRepository, boolean isUnregisterMetricForDeletedStoreEnabled) { - this(metricsRepository, requestType, false, metadataRepository, isUnregisterMetricForDeletedStoreEnabled); + this( + clusterName, + metricsRepository, + requestType, + false, + metadataRepository, + isUnregisterMetricForDeletedStoreEnabled); } public AggRouterHttpRequestStats( + String cluster, MetricsRepository metricsRepository, RequestType requestType, boolean isKeyValueProfilingEnabled, ReadOnlyStoreRepository metadataRepository, boolean isUnregisterMetricForDeletedStoreEnabled) { - super(metricsRepository, metadataRepository, isUnregisterMetricForDeletedStoreEnabled); + super(cluster, metricsRepository, metadataRepository, isUnregisterMetricForDeletedStoreEnabled); /** * Use a setter function to bypass the restriction that the supertype constructor could not * touch member fields of current object. 
*/ - setStatsSupplier((metricsRepo, storeName) -> { + setStatsSupplier((metricsRepo, storeName, clusterName) -> { ScatterGatherStats stats; if (storeName.equals(AbstractVeniceAggStats.STORE_NAME_FOR_TOTAL_STAT)) { stats = new AggScatterGatherStats(); @@ -41,7 +50,13 @@ public AggRouterHttpRequestStats( stats = scatterGatherStatsMap.computeIfAbsent(storeName, k -> new ScatterGatherStats()); } - return new RouterHttpRequestStats(metricsRepo, storeName, requestType, stats, isKeyValueProfilingEnabled); + return new RouterHttpRequestStats( + metricsRepo, + storeName, + clusterName, + requestType, + stats, + isKeyValueProfilingEnabled); }); } @@ -50,19 +65,19 @@ public ScatterGatherStats getScatterGatherStatsForStore(String storeName) { } public void recordRequest(String storeName) { - totalStats.recordRequest(); - getStoreStats(storeName).recordRequest(); + totalStats.recordIncomingRequest(); + getStoreStats(storeName).recordIncomingRequest(); } - public void recordHealthyRequest(String storeName, double latency) { - totalStats.recordHealthyRequest(latency); - getStoreStats(storeName).recordHealthyRequest(latency); + public void recordHealthyRequest(String storeName, double latency, HttpResponseStatus responseStatus) { + totalStats.recordHealthyRequest(latency, responseStatus); + getStoreStats(storeName).recordHealthyRequest(latency, responseStatus); } - public void recordUnhealthyRequest(String storeName) { - totalStats.recordUnhealthyRequest(); + public void recordUnhealthyRequest(String storeName, HttpResponseStatus responseStatus) { + totalStats.recordUnhealthyRequest(responseStatus); if (storeName != null) { - getStoreStats(storeName).recordUnhealthyRequest(); + getStoreStats(storeName).recordUnhealthyRequest(responseStatus); } } @@ -71,10 +86,10 @@ public void recordUnavailableReplicaStreamingRequest(String storeName) { getStoreStats(storeName).recordUnavailableReplicaStreamingRequest(); } - public void recordUnhealthyRequest(String storeName, double latency) { - 
totalStats.recordUnhealthyRequest(latency); + public void recordUnhealthyRequest(String storeName, double latency, HttpResponseStatus responseStatus) { + totalStats.recordUnhealthyRequest(latency, responseStatus); if (storeName != null) { - getStoreStats(storeName).recordUnhealthyRequest(latency); + getStoreStats(storeName).recordUnhealthyRequest(latency, responseStatus); } } @@ -89,9 +104,9 @@ public void recordReadQuotaUsage(String storeName, int quotaUsage) { getStoreStats(storeName).recordReadQuotaUsage(quotaUsage); } - public void recordTardyRequest(String storeName, double latency) { - totalStats.recordTardyRequest(latency); - getStoreStats(storeName).recordTardyRequest(latency); + public void recordTardyRequest(String storeName, double latency, HttpResponseStatus responseStatus) { + totalStats.recordTardyRequest(latency, responseStatus); + getStoreStats(storeName).recordTardyRequest(latency, responseStatus); } /** @@ -101,20 +116,20 @@ public void recordTardyRequest(String storeName, double latency) { * * TODO: Remove this overload after fixing the above. 
*/ - public void recordThrottledRequest(String storeName) { - totalStats.recordThrottledRequest(); - getStoreStats(storeName).recordThrottledRequest(); + public void recordThrottledRequest(String storeName, HttpResponseStatus httpResponseStatus) { + totalStats.recordThrottledRequest(httpResponseStatus); + getStoreStats(storeName).recordThrottledRequest(httpResponseStatus); } - public void recordThrottledRequest(String storeName, double latency) { - totalStats.recordThrottledRequest(latency); - getStoreStats(storeName).recordThrottledRequest(latency); + public void recordThrottledRequest(String storeName, double latency, HttpResponseStatus httpResponseStatus) { + totalStats.recordThrottledRequest(latency, httpResponseStatus); + getStoreStats(storeName).recordThrottledRequest(latency, httpResponseStatus); } - public void recordBadRequest(String storeName) { - totalStats.recordBadRequest(); + public void recordBadRequest(String storeName, HttpResponseStatus responseStatus) { + totalStats.recordBadRequest(responseStatus); if (storeName != null) { - getStoreStats(storeName).recordBadRequest(); + getStoreStats(storeName).recordBadRequest(responseStatus); } } @@ -146,7 +161,9 @@ public void recordFanoutRequestCount(String storeName, int count) { public void recordLatency(String storeName, double latency) { totalStats.recordLatency(latency); - getStoreStats(storeName).recordLatency(latency); + if (storeName != null) { + getStoreStats(storeName).recordLatency(latency); + } } public void recordResponseWaitingTime(String storeName, double waitingTime) { diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java index d53abc2177..93fef88a5b 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java +++ 
b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java @@ -1,13 +1,50 @@ package com.linkedin.venice.router.stats; +import static com.linkedin.venice.router.RouterServer.ROUTER_SERVICE_METRIC_PREFIX; +import static com.linkedin.venice.router.RouterServer.ROUTER_SERVICE_NAME; +import static com.linkedin.venice.router.stats.RouterMetricEntity.ABORTED_RETRY_COUNT; +import static com.linkedin.venice.router.stats.RouterMetricEntity.ALLOWED_RETRY_COUNT; +import static com.linkedin.venice.router.stats.RouterMetricEntity.CALL_COUNT; +import static com.linkedin.venice.router.stats.RouterMetricEntity.CALL_KEY_COUNT; +import static com.linkedin.venice.router.stats.RouterMetricEntity.CALL_TIME; +import static com.linkedin.venice.router.stats.RouterMetricEntity.DISALLOWED_RETRY_COUNT; +import static com.linkedin.venice.router.stats.RouterMetricEntity.INCOMING_CALL_COUNT; +import static com.linkedin.venice.router.stats.RouterMetricEntity.RETRY_COUNT; +import static com.linkedin.venice.router.stats.RouterMetricEntity.RETRY_DELAY; import static com.linkedin.venice.stats.AbstractVeniceAggStats.STORE_NAME_FOR_TOTAL_STAT; - +import static com.linkedin.venice.stats.dimensions.HttpResponseStatusCodeCategory.getVeniceHttpResponseStatusCodeCategory; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE_CATEGORY; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_METHOD; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_RETRY_ABORT_REASON; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_RETRY_TYPE; +import static 
com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_VALIDATION_OUTCOME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_RESPONSE_STATUS_CODE_CATEGORY; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; +import static java.util.Collections.singletonList; + +import com.linkedin.alpini.base.misc.CollectionUtil; import com.linkedin.alpini.router.monitoring.ScatterGatherStats; import com.linkedin.venice.common.VeniceSystemStoreUtils; import com.linkedin.venice.read.RequestType; import com.linkedin.venice.stats.AbstractVeniceHttpStats; import com.linkedin.venice.stats.LambdaStat; import com.linkedin.venice.stats.TehutiUtils; +import com.linkedin.venice.stats.VeniceMetricsConfig; +import com.linkedin.venice.stats.VeniceMetricsRepository; +import com.linkedin.venice.stats.VeniceOpenTelemetryMetricNamingFormat; +import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; +import com.linkedin.venice.stats.dimensions.RequestRetryAbortReason; +import com.linkedin.venice.stats.dimensions.RequestRetryType; +import com.linkedin.venice.stats.dimensions.RequestValidationOutcome; +import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; +import com.linkedin.venice.stats.dimensions.VeniceResponseStatusCategory; +import com.linkedin.venice.stats.metrics.MetricEntityState; +import com.linkedin.venice.stats.metrics.TehutiMetricNameEnum; +import io.netty.handler.codec.http.HttpResponseStatus; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.common.AttributesBuilder; import io.tehuti.Metric; import io.tehuti.metrics.MeasurableStat; import io.tehuti.metrics.MetricConfig; @@ -21,98 +58,223 @@ import io.tehuti.metrics.stats.OccurrenceRate; import io.tehuti.metrics.stats.Rate; import io.tehuti.metrics.stats.Total; +import java.util.Arrays; +import java.util.List; +import java.util.Map; import java.util.concurrent.TimeUnit; import 
java.util.concurrent.atomic.AtomicInteger; public class RouterHttpRequestStats extends AbstractVeniceHttpStats { private static final MetricConfig METRIC_CONFIG = new MetricConfig().timeWindow(10, TimeUnit.SECONDS); - private static final MetricsRepository localMetricRepo = new MetricsRepository(METRIC_CONFIG); + private static final VeniceMetricsRepository localMetricRepo = new VeniceMetricsRepository( + new VeniceMetricsConfig.Builder().setServiceName(ROUTER_SERVICE_NAME) + .setMetricPrefix(ROUTER_SERVICE_METRIC_PREFIX) + .setTehutiMetricConfig(METRIC_CONFIG) + .build()); + private final static Sensor totalInflightRequestSensor = localMetricRepo.sensor("total_inflight_request"); static { totalInflightRequestSensor.add("total_inflight_request_count", new Rate()); } - private final Sensor requestSensor; - private final Sensor healthySensor; - private final Sensor unhealthySensor; - private final Sensor tardySensor; + + /** metrics to track incoming requests */ + private final MetricEntityState incomingRequestMetric; + + /** metrics to track response handling */ + private final MetricEntityState requestMetric; private final Sensor healthyRequestRateSensor; private final Sensor tardyRequestRatioSensor; - private final Sensor throttleSensor; - private final Sensor errorRetryCountSensor; - - private final Sensor latencySensor; - private final Sensor healthyRequestLatencySensor; - private final Sensor unhealthyRequestLatencySensor; - private final Sensor tardyRequestLatencySensor; - private final Sensor throttledRequestLatencySensor; + + /** latency metrics */ + private final Sensor latencyTehutiSensor; // This can be removed while removing tehuti + private final MetricEntityState latencyMetric; + + /** retry metrics */ + private final MetricEntityState retryCountMetric; + private final MetricEntityState allowedRetryCountMetric; + private final MetricEntityState disallowedRetryCountMetric; + private final MetricEntityState retryDelayMetric; + + /** retry aborted metrics 
*/ + private final MetricEntityState abortedRetryCountMetric; + + /** key count metrics */ + private final MetricEntityState keyCountMetric; + + /** OTel metrics yet to be added */ private final Sensor requestSizeSensor; private final Sensor compressedResponseSizeSensor; private final Sensor responseSizeSensor; - private final Sensor badRequestSensor; - private final Sensor badRequestKeyCountSensor; private final Sensor requestThrottledByRouterCapacitySensor; private final Sensor decompressionTimeSensor; private final Sensor routerResponseWaitingTimeSensor; private final Sensor fanoutRequestCountSensor; private final Sensor quotaSensor; private final Sensor findUnhealthyHostRequestSensor; - private final Sensor keyNumSensor; // Reflect the real request usage, e.g count each key as an unit of request usage. private final Sensor requestUsageSensor; private final Sensor requestParsingLatencySensor; private final Sensor requestRoutingLatencySensor; private final Sensor unAvailableRequestSensor; - private final Sensor delayConstraintAbortedRetryRequest; - private final Sensor slowRouteAbortedRetryRequest; - private final Sensor retryRouteLimitAbortedRetryRequest; - private final Sensor noAvailableReplicaAbortedRetryRequest; private final Sensor readQuotaUsageSensor; private final Sensor inFlightRequestSensor; private final AtomicInteger currentInFlightRequest; private final Sensor unavailableReplicaStreamingRequestSensor; - private final Sensor allowedRetryRequestSensor; - private final Sensor disallowedRetryRequestSensor; - private final Sensor errorRetryAttemptTriggeredByPendingRequestCheckSensor; - private final Sensor retryDelaySensor; private final Sensor multiGetFallbackSensor; private final Sensor metaStoreShadowReadSensor; private Sensor keySizeSensor; + + /** TODO: Need to clarify the usage and add new OTel metrics or add it as a part of existing ones */ + private final Sensor errorRetryAttemptTriggeredByPendingRequestCheckSensor; + private final String 
systemStoreName; + private final Attributes commonMetricDimensions; + private final boolean emitOpenTelemetryMetrics; + private final VeniceOpenTelemetryMetricNamingFormat openTelemetryMetricFormat; // QPS metrics public RouterHttpRequestStats( MetricsRepository metricsRepository, String storeName, + String clusterName, RequestType requestType, ScatterGatherStats scatterGatherStats, boolean isKeyValueProfilingEnabled) { super(metricsRepository, storeName, requestType); + VeniceOpenTelemetryMetricsRepository otelRepository; + if (metricsRepository instanceof VeniceMetricsRepository) { + VeniceMetricsRepository veniceMetricsRepository = (VeniceMetricsRepository) metricsRepository; + VeniceMetricsConfig veniceMetricsConfig = veniceMetricsRepository.getVeniceMetricsConfig(); + emitOpenTelemetryMetrics = veniceMetricsConfig.emitOtelMetrics(); + openTelemetryMetricFormat = veniceMetricsConfig.getMetricNamingFormat(); + otelRepository = veniceMetricsRepository.getOpenTelemetryMetricsRepository(); + AttributesBuilder attributesBuilder = Attributes.builder() + .put(getDimensionName(VENICE_STORE_NAME), storeName) + .put(getDimensionName(VENICE_REQUEST_METHOD), requestType.name().toLowerCase()) + .put(getDimensionName(VENICE_CLUSTER_NAME), clusterName); + // add custom dimensions passed in by the user + for (Map.Entry entry: veniceMetricsConfig.getOtelCustomDimensionsMap().entrySet()) { + attributesBuilder.put(entry.getKey(), entry.getValue()); + } + commonMetricDimensions = attributesBuilder.build(); + } else { + emitOpenTelemetryMetrics = false; + openTelemetryMetricFormat = VeniceOpenTelemetryMetricNamingFormat.SNAKE_CASE; + commonMetricDimensions = null; + otelRepository = null; + } + this.systemStoreName = VeniceSystemStoreUtils.extractSystemStoreType(storeName); Rate requestRate = new OccurrenceRate(); Rate healthyRequestRate = new OccurrenceRate(); Rate tardyRequestRate = new OccurrenceRate(); - requestSensor = registerSensor("request", new Count(), requestRate); - 
healthySensor = registerSensor("healthy_request", new Count(), healthyRequestRate); - unhealthySensor = registerSensor("unhealthy_request", new Count()); - unavailableReplicaStreamingRequestSensor = registerSensor("unavailable_replica_streaming_request", new Count()); - tardySensor = registerSensor("tardy_request", new Count(), tardyRequestRate); + + incomingRequestMetric = new MetricEntityState( + INCOMING_CALL_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + CollectionUtil.>mapBuilder() + .put(RouterTehutiMetricNameEnum.REQUEST, Arrays.asList(new Count(), requestRate)) + .build()); + healthyRequestRateSensor = registerSensor(new TehutiUtils.SimpleRatioStat(healthyRequestRate, requestRate, "healthy_request_ratio")); tardyRequestRatioSensor = registerSensor(new TehutiUtils.SimpleRatioStat(tardyRequestRate, requestRate, "tardy_request_ratio")); - throttleSensor = registerSensor("throttled_request", new Count()); - errorRetryCountSensor = registerSensor("error_retry", new Count()); - badRequestSensor = registerSensor("bad_request", new Count()); - badRequestKeyCountSensor = registerSensor("bad_request_key_count", new OccurrenceRate(), new Avg(), new Max()); + + requestMetric = new MetricEntityState( + CALL_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + CollectionUtil.>mapBuilder() + .put(RouterTehutiMetricNameEnum.HEALTHY_REQUEST, Arrays.asList(new Count(), healthyRequestRate)) + .put(RouterTehutiMetricNameEnum.UNHEALTHY_REQUEST, singletonList(new Count())) + .put(RouterTehutiMetricNameEnum.TARDY_REQUEST, Arrays.asList(new Count(), tardyRequestRate)) + .put(RouterTehutiMetricNameEnum.THROTTLED_REQUEST, singletonList(new Count())) + .put(RouterTehutiMetricNameEnum.BAD_REQUEST, singletonList(new Count())) + .build()); + + latencyTehutiSensor = registerSensorWithDetailedPercentiles("latency", new Avg(), new Max(0)); + latencyMetric = new MetricEntityState( + CALL_TIME.getMetricEntity(), + otelRepository, + 
this::registerSensorFinal, + CollectionUtil.>mapBuilder() + .put( + RouterTehutiMetricNameEnum.HEALTHY_REQUEST_LATENCY, + Arrays.asList( + new Avg(), + new Max(0), + TehutiUtils.getPercentileStatForNetworkLatency( + getName(), + getFullMetricName(RouterTehutiMetricNameEnum.HEALTHY_REQUEST_LATENCY.getMetricName())))) + .put(RouterTehutiMetricNameEnum.UNHEALTHY_REQUEST_LATENCY, Arrays.asList(new Avg(), new Max(0))) + .put(RouterTehutiMetricNameEnum.TARDY_REQUEST_LATENCY, Arrays.asList(new Avg(), new Max(0))) + .put(RouterTehutiMetricNameEnum.THROTTLED_REQUEST_LATENCY, Arrays.asList(new Avg(), new Max(0))) + .build()); + + retryCountMetric = new MetricEntityState( + RETRY_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + CollectionUtil.>mapBuilder() + .put(RouterTehutiMetricNameEnum.ERROR_RETRY, singletonList(new Count())) + .build()); + + allowedRetryCountMetric = new MetricEntityState( + ALLOWED_RETRY_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + CollectionUtil.>mapBuilder() + .put(RouterTehutiMetricNameEnum.ALLOWED_RETRY_REQUEST_COUNT, singletonList(new OccurrenceRate())) + .build()); + + disallowedRetryCountMetric = new MetricEntityState( + DISALLOWED_RETRY_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + CollectionUtil.>mapBuilder() + .put(RouterTehutiMetricNameEnum.DISALLOWED_RETRY_REQUEST_COUNT, singletonList(new OccurrenceRate())) + .build()); + + retryDelayMetric = new MetricEntityState( + RETRY_DELAY.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + CollectionUtil.>mapBuilder() + .put(RouterTehutiMetricNameEnum.RETRY_DELAY, Arrays.asList(new Avg(), new Max())) + .build()); + + abortedRetryCountMetric = new MetricEntityState( + ABORTED_RETRY_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + CollectionUtil.>mapBuilder() + .put(RouterTehutiMetricNameEnum.DELAY_CONSTRAINT_ABORTED_RETRY_REQUEST, singletonList(new Count())) + 
.put(RouterTehutiMetricNameEnum.SLOW_ROUTE_ABORTED_RETRY_REQUEST, singletonList(new Count())) + .put(RouterTehutiMetricNameEnum.RETRY_ROUTE_LIMIT_ABORTED_RETRY_REQUEST, singletonList(new Count())) + .put(RouterTehutiMetricNameEnum.NO_AVAILABLE_REPLICA_ABORTED_RETRY_REQUEST, singletonList(new Count())) + .build()); + + keyCountMetric = new MetricEntityState( + CALL_KEY_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + CollectionUtil.>mapBuilder() + .put(RouterTehutiMetricNameEnum.KEY_NUM, Arrays.asList(new OccurrenceRate(), new Avg(), new Max(0))) + .put( + RouterTehutiMetricNameEnum.BAD_REQUEST_KEY_COUNT, + Arrays.asList(new OccurrenceRate(), new Avg(), new Max(0))) + .build()); + + errorRetryAttemptTriggeredByPendingRequestCheckSensor = + registerSensor("error_retry_attempt_triggered_by_pending_request_check", new OccurrenceRate()); + + unavailableReplicaStreamingRequestSensor = registerSensor("unavailable_replica_streaming_request", new Count()); requestThrottledByRouterCapacitySensor = registerSensor("request_throttled_by_router_capacity", new Count()); fanoutRequestCountSensor = registerSensor("fanout_request_count", new Avg(), new Max(0)); - latencySensor = registerSensorWithDetailedPercentiles("latency", new Avg(), new Max(0)); - healthyRequestLatencySensor = - registerSensorWithDetailedPercentiles("healthy_request_latency", new Avg(), new Max(0)); - unhealthyRequestLatencySensor = registerSensor("unhealthy_request_latency", new Avg(), new Max(0)); - tardyRequestLatencySensor = registerSensor("tardy_request_latency", new Avg(), new Max(0)); - throttledRequestLatencySensor = registerSensor("throttled_request_latency", new Avg(), new Max(0)); + routerResponseWaitingTimeSensor = registerSensor( "response_waiting_time", TehutiUtils.getPercentileStat(getName(), getFullMetricName("response_waiting_time"))); @@ -146,7 +308,6 @@ public RouterHttpRequestStats( (ignored, ignored2) -> scatterGatherStats.getTotalRetriesWinner(), 
"retry_faster_than_original_count")); - keyNumSensor = registerSensor("key_num", new Avg(), new Max(0)); /** * request_usage.Total is incoming KPS while request_usage.OccurrenceRate is QPS */ @@ -158,11 +319,6 @@ public RouterHttpRequestStats( unAvailableRequestSensor = registerSensor("unavailable_request", new Count()); - delayConstraintAbortedRetryRequest = registerSensor("delay_constraint_aborted_retry_request", new Count()); - slowRouteAbortedRetryRequest = registerSensor("slow_route_aborted_retry_request", new Count()); - retryRouteLimitAbortedRetryRequest = registerSensor("retry_route_limit_aborted_retry_request", new Count()); - noAvailableReplicaAbortedRetryRequest = registerSensor("no_available_replica_aborted_retry_request", new Count()); - readQuotaUsageSensor = registerSensor("read_quota_usage_kps", new Total()); inFlightRequestSensor = registerSensor("in_flight_request_count", new Min(), new Max(0), new Avg()); @@ -189,42 +345,50 @@ public RouterHttpRequestStats( } currentInFlightRequest = new AtomicInteger(); - allowedRetryRequestSensor = registerSensor("allowed_retry_request_count", new OccurrenceRate()); - disallowedRetryRequestSensor = registerSensor("disallowed_retry_request_count", new OccurrenceRate()); - errorRetryAttemptTriggeredByPendingRequestCheckSensor = - registerSensor("error_retry_attempt_triggered_by_pending_request_check", new OccurrenceRate()); - retryDelaySensor = registerSensor("retry_delay", new Avg(), new Max()); metaStoreShadowReadSensor = registerSensor("meta_store_shadow_read", new OccurrenceRate()); } + private String getDimensionName(VeniceMetricsDimensions dimension) { + return dimension.getDimensionName(openTelemetryMetricFormat); + } + /** * We record this at the beginning of request handling, so we don't know the latency yet... All specific * types of requests also have their latencies logged at the same time. 
*/ - public void recordRequest() { - requestSensor.record(); + public void recordIncomingRequest() { + incomingRequestMetric.record(RouterTehutiMetricNameEnum.REQUEST, 1, commonMetricDimensions); inFlightRequestSensor.record(currentInFlightRequest.incrementAndGet()); totalInflightRequestSensor.record(); } - public void recordHealthyRequest(Double latency) { - healthySensor.record(); + public void recordHealthyRequest(Double latency, HttpResponseStatus responseStatus) { + TehutiMetricNameEnum tehutiMetricNameEnum = RouterTehutiMetricNameEnum.HEALTHY_REQUEST; + VeniceResponseStatusCategory veniceResponseStatusCategory = VeniceResponseStatusCategory.HEALTHY; + recordRequestMetric(tehutiMetricNameEnum, responseStatus, veniceResponseStatusCategory); if (latency != null) { - healthyRequestLatencySensor.record(latency); + recordLatencyMetric(tehutiMetricNameEnum, latency, responseStatus, veniceResponseStatusCategory); } } - public void recordUnhealthyRequest() { - unhealthySensor.record(); + public void recordUnhealthyRequest(HttpResponseStatus responseStatus) { + recordRequestMetric( + RouterTehutiMetricNameEnum.UNHEALTHY_REQUEST, + responseStatus, + VeniceResponseStatusCategory.UNHEALTHY); } - public void recordUnavailableReplicaStreamingRequest() { - unavailableReplicaStreamingRequestSensor.record(); + public void recordUnhealthyRequest(double latency, HttpResponseStatus responseStatus) { + recordUnhealthyRequest(responseStatus); + recordLatencyMetric( + RouterTehutiMetricNameEnum.UNHEALTHY_REQUEST_LATENCY, + latency, + responseStatus, + VeniceResponseStatusCategory.UNHEALTHY); } - public void recordUnhealthyRequest(double latency) { - recordUnhealthyRequest(); - unhealthyRequestLatencySensor.record(latency); + public void recordUnavailableReplicaStreamingRequest() { + unavailableReplicaStreamingRequestSensor.record(); } /** @@ -235,14 +399,20 @@ public void recordReadQuotaUsage(int quotaUsage) { readQuotaUsageSensor.record(quotaUsage); } - public void 
recordTardyRequest(double latency) { - tardySensor.record(); - tardyRequestLatencySensor.record(latency); + public void recordTardyRequest(double latency, HttpResponseStatus responseStatus) { + TehutiMetricNameEnum tehutiMetricNameEnum = RouterTehutiMetricNameEnum.TARDY_REQUEST; + VeniceResponseStatusCategory veniceResponseStatusCategory = VeniceResponseStatusCategory.TARDY; + recordRequestMetric(tehutiMetricNameEnum, responseStatus, veniceResponseStatusCategory); + recordLatencyMetric(tehutiMetricNameEnum, latency, responseStatus, veniceResponseStatusCategory); } - public void recordThrottledRequest(double latency) { - recordThrottledRequest(); - throttledRequestLatencySensor.record(latency); + public void recordThrottledRequest(double latency, HttpResponseStatus responseStatus) { + recordThrottledRequest(responseStatus); + recordLatencyMetric( + RouterTehutiMetricNameEnum.THROTTLED_REQUEST_LATENCY, + latency, + responseStatus, + VeniceResponseStatusCategory.THROTTLED); } /** @@ -252,20 +422,50 @@ public void recordThrottledRequest(double latency) { * * TODO: Remove this overload after fixing the above. 
*/ - public void recordThrottledRequest() { - throttleSensor.record(); + public void recordThrottledRequest(HttpResponseStatus responseStatus) { + recordRequestMetric( + RouterTehutiMetricNameEnum.THROTTLED_REQUEST, + responseStatus, + VeniceResponseStatusCategory.THROTTLED); } public void recordErrorRetryCount() { - errorRetryCountSensor.record(); + recordRetryTriggeredSensorOtel(RequestRetryType.ERROR_RETRY); } - public void recordBadRequest() { - badRequestSensor.record(); + public void recordRetryTriggeredSensorOtel(RequestRetryType retryType) { + Attributes dimensions = null; + if (emitOpenTelemetryMetrics) { + dimensions = Attributes.builder() + .putAll(commonMetricDimensions) + .put(getDimensionName(VENICE_REQUEST_RETRY_TYPE), retryType.getRetryType()) + .build(); + } + retryCountMetric.record(RouterTehutiMetricNameEnum.ERROR_RETRY, 1, dimensions); + } + + public void recordAbortedRetrySensorOtel( + TehutiMetricNameEnum tehutiMetricNameEnum, + RequestRetryAbortReason abortReason) { + Attributes dimensions = null; + if (emitOpenTelemetryMetrics) { + dimensions = Attributes.builder() + .putAll(commonMetricDimensions) + .put(getDimensionName(VENICE_REQUEST_RETRY_ABORT_REASON), abortReason.getAbortReason()) + .build(); + } + abortedRetryCountMetric.record(tehutiMetricNameEnum, 1, dimensions); + } + + public void recordBadRequest(HttpResponseStatus responseStatus) { + recordRequestMetric( + RouterTehutiMetricNameEnum.BAD_REQUEST, + responseStatus, + VeniceResponseStatusCategory.BAD_REQUEST); } public void recordBadRequestKeyCount(int keyCount) { - badRequestKeyCountSensor.record(keyCount); + recordKeyCountMetric(keyCount, RequestValidationOutcome.INVALID_KEY_COUNT_LIMIT_EXCEEDED); } public void recordRequestThrottledByRouterCapacity() { @@ -279,7 +479,44 @@ public void recordFanoutRequestCount(int count) { } public void recordLatency(double latency) { - latencySensor.record(latency); + latencyTehutiSensor.record(latency); + } + + public void recordLatencyMetric( 
+ TehutiMetricNameEnum tehutiMetricNameEnum, + double latency, + HttpResponseStatus responseStatus, + VeniceResponseStatusCategory veniceResponseStatusCategory) { + Attributes dimensions = null; + if (emitOpenTelemetryMetrics) { + dimensions = Attributes.builder() + .putAll(commonMetricDimensions) + // Don't add HTTP_RESPONSE_STATUS_CODE to reduce the cardinality for histogram + .put( + getDimensionName(HTTP_RESPONSE_STATUS_CODE_CATEGORY), + getVeniceHttpResponseStatusCodeCategory(responseStatus)) + .put(getDimensionName(VENICE_RESPONSE_STATUS_CODE_CATEGORY), veniceResponseStatusCategory.getCategory()) + .build(); + } + latencyMetric.record(tehutiMetricNameEnum, latency, dimensions); + } + + public void recordRequestMetric( + TehutiMetricNameEnum tehutiMetricNameEnum, + HttpResponseStatus responseStatus, + VeniceResponseStatusCategory veniceResponseStatusCategory) { + Attributes dimensions = null; + if (emitOpenTelemetryMetrics) { + dimensions = Attributes.builder() + .putAll(commonMetricDimensions) + .put( + getDimensionName(HTTP_RESPONSE_STATUS_CODE_CATEGORY), + getVeniceHttpResponseStatusCodeCategory(responseStatus)) + .put(getDimensionName(VENICE_RESPONSE_STATUS_CODE_CATEGORY), veniceResponseStatusCategory.getCategory()) + .put(getDimensionName(HTTP_RESPONSE_STATUS_CODE), responseStatus.codeAsText().toString()) + .build(); + } + requestMetric.record(tehutiMetricNameEnum, 1, dimensions); } public void recordResponseWaitingTime(double waitingTime) { @@ -311,7 +548,18 @@ public void recordFindUnhealthyHostRequest() { } public void recordKeyNum(int keyNum) { - keyNumSensor.record(keyNum); + recordKeyCountMetric(keyNum, RequestValidationOutcome.VALID); + } + + public void recordKeyCountMetric(int keyNum, RequestValidationOutcome outcome) { + Attributes dimensions = null; + if (emitOpenTelemetryMetrics) { + dimensions = Attributes.builder() + .putAll(commonMetricDimensions) + .put(getDimensionName(VENICE_REQUEST_VALIDATION_OUTCOME), outcome.getOutcome()) + .build(); 
+ } + keyCountMetric.record(RouterTehutiMetricNameEnum.KEY_NUM, keyNum, dimensions); } public void recordRequestUsage(int usage) { @@ -335,19 +583,27 @@ public void recordUnavailableRequest() { } public void recordDelayConstraintAbortedRetryRequest() { - delayConstraintAbortedRetryRequest.record(); + recordAbortedRetrySensorOtel( + RouterTehutiMetricNameEnum.DELAY_CONSTRAINT_ABORTED_RETRY_REQUEST, + RequestRetryAbortReason.DELAY_CONSTRAINT); } public void recordSlowRouteAbortedRetryRequest() { - slowRouteAbortedRetryRequest.record(); + recordAbortedRetrySensorOtel( + RouterTehutiMetricNameEnum.SLOW_ROUTE_ABORTED_RETRY_REQUEST, + RequestRetryAbortReason.SLOW_ROUTE); } public void recordRetryRouteLimitAbortedRetryRequest() { - retryRouteLimitAbortedRetryRequest.record(); + recordAbortedRetrySensorOtel( + RouterTehutiMetricNameEnum.RETRY_ROUTE_LIMIT_ABORTED_RETRY_REQUEST, + RequestRetryAbortReason.MAX_RETRY_ROUTE_LIMIT); } public void recordNoAvailableReplicaAbortedRetryRequest() { - noAvailableReplicaAbortedRetryRequest.record(); + recordAbortedRetrySensorOtel( + RouterTehutiMetricNameEnum.NO_AVAILABLE_REPLICA_ABORTED_RETRY_REQUEST, + RequestRetryAbortReason.NO_AVAILABLE_REPLICA); } public void recordKeySizeInByte(long keySize) { @@ -358,7 +614,7 @@ public void recordKeySizeInByte(long keySize) { public void recordResponse() { /** - * We already report into the sensor when the request starts, in {@link #recordRequest()}, so at response time + * We already report into the sensor when the request starts, in {@link #recordIncomingRequest()}, so at response time * there is no need to record into the sensor again. We just want to maintain the bookkeeping. 
*/ currentInFlightRequest.decrementAndGet(); @@ -366,11 +622,12 @@ public void recordResponse() { } public void recordAllowedRetryRequest() { - allowedRetryRequestSensor.record(); + allowedRetryCountMetric.record(RouterTehutiMetricNameEnum.ALLOWED_RETRY_REQUEST_COUNT, 1, commonMetricDimensions); } public void recordDisallowedRetryRequest() { - disallowedRetryRequestSensor.record(); + disallowedRetryCountMetric + .record(RouterTehutiMetricNameEnum.DISALLOWED_RETRY_REQUEST_COUNT, 1, commonMetricDimensions); } public void recordErrorRetryAttemptTriggeredByPendingRequestCheck() { @@ -378,7 +635,7 @@ public void recordErrorRetryAttemptTriggeredByPendingRequestCheck() { } public void recordRetryDelay(double delay) { - retryDelaySensor.record(delay); + retryDelayMetric.record(RouterTehutiMetricNameEnum.RETRY_DELAY, delay, commonMetricDimensions); } public void recordMetaStoreShadowRead() { @@ -390,9 +647,53 @@ protected Sensor registerSensor(String sensorName, MeasurableStat... stats) { return super.registerSensor(systemStoreName == null ? sensorName : systemStoreName, null, stats); } + /** + * This method will be passed to the constructor of {@link MetricEntityState} to register tehuti sensor. + * Only private/static/final methods can be passed onto the constructor. + */ + private Sensor registerSensorFinal(String sensorName, MeasurableStat... stats) { + return this.registerSensor(sensorName, stats); + } + static public boolean hasInFlightRequests() { Metric metric = localMetricRepo.getMetric("total_inflight_request_count"); // max return -infinity when there are no samples. validate only against finite value return Double.isFinite(metric.value()) ? 
metric.value() > 0.0 : false; } + + /** + * Metric names for tehuti metrics used in this class + */ + private enum RouterTehutiMetricNameEnum implements TehutiMetricNameEnum { + /** for {@link RouterMetricEntity#INCOMING_CALL_COUNT} */ + REQUEST, + /** for {@link RouterMetricEntity#CALL_COUNT} */ + HEALTHY_REQUEST, UNHEALTHY_REQUEST, TARDY_REQUEST, THROTTLED_REQUEST, BAD_REQUEST, + /** for {@link RouterMetricEntity#CALL_TIME} */ + HEALTHY_REQUEST_LATENCY, UNHEALTHY_REQUEST_LATENCY, TARDY_REQUEST_LATENCY, THROTTLED_REQUEST_LATENCY, + /** for {@link RouterMetricEntity#RETRY_COUNT} */ + ERROR_RETRY, + /** for {@link RouterMetricEntity#ALLOWED_RETRY_COUNT} */ + ALLOWED_RETRY_REQUEST_COUNT, + /** for {@link RouterMetricEntity#DISALLOWED_RETRY_COUNT} */ + DISALLOWED_RETRY_REQUEST_COUNT, + /** for {@link RouterMetricEntity#RETRY_DELAY} */ + RETRY_DELAY, + /** for {@link RouterMetricEntity#ABORTED_RETRY_COUNT} */ + DELAY_CONSTRAINT_ABORTED_RETRY_REQUEST, SLOW_ROUTE_ABORTED_RETRY_REQUEST, RETRY_ROUTE_LIMIT_ABORTED_RETRY_REQUEST, + NO_AVAILABLE_REPLICA_ABORTED_RETRY_REQUEST, + /** for {@link RouterMetricEntity#CALL_KEY_COUNT} */ + KEY_NUM, BAD_REQUEST_KEY_COUNT; + + private final String metricName; + + RouterTehutiMetricNameEnum() { + this.metricName = name().toLowerCase(); + } + + @Override + public String getMetricName() { + return this.metricName; + } + } } diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterMetricEntity.java b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterMetricEntity.java new file mode 100644 index 0000000000..f542e6dc83 --- /dev/null +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterMetricEntity.java @@ -0,0 +1,88 @@ +package com.linkedin.venice.router.stats; + +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE; +import static 
com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE_CATEGORY; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_METHOD; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_RETRY_ABORT_REASON; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_RETRY_TYPE; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_VALIDATION_OUTCOME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_RESPONSE_STATUS_CODE_CATEGORY; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; +import static com.linkedin.venice.utils.Utils.setOf; + +import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; +import com.linkedin.venice.stats.metrics.MetricEntity; +import com.linkedin.venice.stats.metrics.MetricType; +import com.linkedin.venice.stats.metrics.MetricUnit; +import java.util.Set; + + +/** + * List all Metric entities for router + */ +public enum RouterMetricEntity { + INCOMING_CALL_COUNT( + "incoming_call_count", MetricType.COUNTER, MetricUnit.NUMBER, "Count of all incoming requests", + setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD) + ), + CALL_COUNT( + "call_count", MetricType.COUNTER, MetricUnit.NUMBER, "Count of all requests with response details", + setOf( + VENICE_STORE_NAME, + VENICE_CLUSTER_NAME, + VENICE_REQUEST_METHOD, + HTTP_RESPONSE_STATUS_CODE, + HTTP_RESPONSE_STATUS_CODE_CATEGORY, + VENICE_RESPONSE_STATUS_CODE_CATEGORY) + ), + CALL_TIME( + "call_time", MetricType.HISTOGRAM, MetricUnit.MILLISECOND, "Latency based on all responses", + setOf( + VENICE_STORE_NAME, + VENICE_CLUSTER_NAME, + VENICE_REQUEST_METHOD, + HTTP_RESPONSE_STATUS_CODE_CATEGORY, + VENICE_RESPONSE_STATUS_CODE_CATEGORY) + ), 
+ CALL_KEY_COUNT( + "call_key_count", MetricType.MIN_MAX_COUNT_SUM_AGGREGATIONS, MetricUnit.NUMBER, + "Count of keys in multi key requests", + setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD, VENICE_REQUEST_VALIDATION_OUTCOME) + ), + RETRY_COUNT( + "retry_count", MetricType.COUNTER, MetricUnit.NUMBER, "Count of retries triggered", + setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD, VENICE_REQUEST_RETRY_TYPE) + ), + ALLOWED_RETRY_COUNT( + "allowed_retry_count", MetricType.COUNTER, MetricUnit.NUMBER, "Count of allowed retry requests", + setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD) + ), + DISALLOWED_RETRY_COUNT( + "disallowed_retry_count", MetricType.COUNTER, MetricUnit.NUMBER, "Count of disallowed retry requests", + setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD) + ), + RETRY_DELAY( + "retry_delay", MetricType.MIN_MAX_COUNT_SUM_AGGREGATIONS, MetricUnit.MILLISECOND, "Retry delay time", + setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD) + ), + ABORTED_RETRY_COUNT( + "aborted_retry_count", MetricType.COUNTER, MetricUnit.NUMBER, "Count of aborted retry requests", + setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD, VENICE_REQUEST_RETRY_ABORT_REASON) + ); + + private final MetricEntity metricEntity; + + RouterMetricEntity( + String metricName, + MetricType metricType, + MetricUnit unit, + String description, + Set dimensionsList) { + this.metricEntity = new MetricEntity(metricName, metricType, unit, description, dimensionsList); + } + + public MetricEntity getMetricEntity() { + return metricEntity; + } +} diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/AggRouterHttpRequestStatsTest.java b/services/venice-router/src/test/java/com/linkedin/venice/router/AggRouterHttpRequestStatsTest.java index 3490d97483..333b37ca33 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/AggRouterHttpRequestStatsTest.java 
+++ b/services/venice-router/src/test/java/com/linkedin/venice/router/AggRouterHttpRequestStatsTest.java @@ -1,10 +1,12 @@ package com.linkedin.venice.router; +import static io.netty.handler.codec.http.HttpResponseStatus.TOO_MANY_REQUESTS; + import com.linkedin.venice.meta.ReadOnlyStoreRepository; import com.linkedin.venice.read.RequestType; import com.linkedin.venice.router.stats.AggRouterHttpRequestStats; +import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.tehuti.MockTehutiReporter; -import io.tehuti.metrics.MetricsRepository; import org.mockito.Mockito; import org.testng.Assert; import org.testng.annotations.BeforeSuite; @@ -12,13 +14,13 @@ public class AggRouterHttpRequestStatsTest { - MetricsRepository metricsRepository; + VeniceMetricsRepository metricsRepository; private MockTehutiReporter reporter; private ReadOnlyStoreRepository storeMetadataRepository; @BeforeSuite public void setUp() { - this.metricsRepository = new MetricsRepository(); + this.metricsRepository = new VeniceMetricsRepository(); reporter = new MockTehutiReporter(); metricsRepository.addReporter(reporter); storeMetadataRepository = Mockito.mock(ReadOnlyStoreRepository.class); @@ -26,8 +28,12 @@ public void setUp() { @Test public void testAggRouterMetrics() { - AggRouterHttpRequestStats stats = - new AggRouterHttpRequestStats(metricsRepository, RequestType.SINGLE_GET, storeMetadataRepository, true); + AggRouterHttpRequestStats stats = new AggRouterHttpRequestStats( + "test-cluster", + metricsRepository, + RequestType.SINGLE_GET, + storeMetadataRepository, + true); stats.recordRequest("store5"); Assert.assertEquals(reporter.query(".total--request.Count").value(), 1d); @@ -37,8 +43,8 @@ public void testAggRouterMetrics() { Assert.assertNotNull(metricsRepository.getMetric(".store1--request.Count")); Assert.assertEquals(reporter.query(".store1--request.Count").value(), 1d); - stats.recordThrottledRequest("store1", 1.0); - stats.recordThrottledRequest("store2", 
1.0); + stats.recordThrottledRequest("store1", 1.0, TOO_MANY_REQUESTS); + stats.recordThrottledRequest("store2", 1.0, TOO_MANY_REQUESTS); stats.recordErrorRetryCount("store1"); Assert.assertEquals(reporter.query(".total--request.Count").value(), 2d); Assert.assertEquals(reporter.query(".store1--request.Count").value(), 1d); @@ -59,8 +65,13 @@ public void testAggRouterMetrics() { @Test public void testProfilingMetrics() { - AggRouterHttpRequestStats stats = - new AggRouterHttpRequestStats(metricsRepository, RequestType.COMPUTE, true, storeMetadataRepository, true); + AggRouterHttpRequestStats stats = new AggRouterHttpRequestStats( + "test-cluster", + metricsRepository, + RequestType.COMPUTE, + true, + storeMetadataRepository, + true); for (int i = 1; i <= 100; i += 1) { stats.recordKeySize("store1", i); diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/RouteHttpRequestStatsTest.java b/services/venice-router/src/test/java/com/linkedin/venice/router/RouteHttpRequestStatsTest.java index a364f4a166..36d7baeadc 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/RouteHttpRequestStatsTest.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/RouteHttpRequestStatsTest.java @@ -22,13 +22,18 @@ public class RouteHttpRequestStatsTest { @BeforeSuite public void setUp() { - MetricsRepository metrics = MetricsRepositoryUtils.createSingleThreadedMetricsRepository(); + MetricsRepository metrics = MetricsRepositoryUtils.createSingleThreadedVeniceMetricsRepository(); reporter = new MockTehutiReporter(); metrics.addReporter(reporter); stats = new RouteHttpRequestStats(metrics, mock(StorageNodeClient.class)); - routerHttpRequestStats = - new RouterHttpRequestStats(metrics, "", RequestType.SINGLE_GET, mock(ScatterGatherStats.class), false); + routerHttpRequestStats = new RouterHttpRequestStats( + metrics, + "test-store", + "test-cluster", + RequestType.SINGLE_GET, + mock(ScatterGatherStats.class), + false); } 
@Test @@ -46,7 +51,7 @@ public void routerMetricsTest() { Assert.assertEquals(stats.getPendingRequestCount("my_host1"), 1); Assert.assertEquals(stats.getPendingRequestCount("my_host2"), 0); - routerHttpRequestStats.recordRequest(); + routerHttpRequestStats.recordIncomingRequest(); Assert.assertTrue(RouterHttpRequestStats.hasInFlightRequests()); } } diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceDelegateMode.java b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceDelegateMode.java index e558337e71..3669622691 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceDelegateMode.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceDelegateMode.java @@ -39,10 +39,10 @@ import com.linkedin.venice.router.stats.RouterStats; import com.linkedin.venice.router.throttle.ReadRequestThrottler; import com.linkedin.venice.schema.avro.ReadAvroProtocolDefinition; +import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.utils.HelixUtils; import com.linkedin.venice.utils.Utils; import io.netty.handler.codec.http.HttpMethod; -import io.tehuti.metrics.MetricsRepository; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -225,7 +225,8 @@ public void setUp() { RouterExceptionAndTrackingUtils.setRouterStats( new RouterStats<>( requestType -> new AggRouterHttpRequestStats( - new MetricsRepository(), + "test-cluster", + new VeniceMetricsRepository(), requestType, mock(ReadOnlyStoreRepository.class), true))); @@ -378,7 +379,8 @@ public void testLeastLoadedOnSlowHosts() throws RouterException { config, new RouterStats<>( requestType -> new AggRouterHttpRequestStats( - new MetricsRepository(), + "test-cluster", + new VeniceMetricsRepository(), requestType, mock(ReadOnlyStoreRepository.class), true)), @@ -702,7 +704,7 @@ public void testScatterForMultiGetWithHelixAssistedRouting() 
throws RouterExcept doReturn(1).when(helixInstanceConfigRepository).getInstanceGroupId(instance4.getNodeId()); HelixGroupSelector helixGroupSelector = new HelixGroupSelector( - new MetricsRepository(), + new VeniceMetricsRepository(), helixInstanceConfigRepository, HelixGroupSelectionStrategyEnum.ROUND_ROBIN, mock(TimeoutProcessor.class)); diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVenicePathParser.java b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVenicePathParser.java index b86f169b50..f6d497a18b 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVenicePathParser.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVenicePathParser.java @@ -34,6 +34,7 @@ import com.linkedin.venice.schema.avro.ReadAvroProtocolDefinition; import com.linkedin.venice.serializer.RecordSerializer; import com.linkedin.venice.serializer.SerializerDeserializerFactory; +import com.linkedin.venice.stats.VeniceMetricsRepository; import io.netty.buffer.ByteBuf; import io.netty.buffer.Unpooled; import io.netty.handler.codec.http.DefaultHttpHeaders; @@ -41,7 +42,6 @@ import io.netty.handler.codec.http.HttpHeaders; import io.netty.handler.codec.http.HttpMethod; import io.netty.handler.codec.http.HttpVersion; -import io.tehuti.metrics.MetricsRepository; import java.nio.ByteBuffer; import java.util.AbstractMap; import java.util.ArrayList; @@ -100,7 +100,8 @@ public void setUp() { RouterExceptionAndTrackingUtils.setRouterStats( new RouterStats<>( requestType -> new AggRouterHttpRequestStats( - new MetricsRepository(), + CLUSTER, + new VeniceMetricsRepository(), requestType, mock(ReadOnlyStoreRepository.class), true))); @@ -128,7 +129,7 @@ public void testParseResourceUri_ComputeRequest() throws RouterException { storeRepository, mock(VeniceRouterConfig.class), mock(CompressorFactory.class), - mock(MetricsRepository.class), + mock(VeniceMetricsRepository.class), 
mock(ScheduledExecutorService.class)); String storeName = "test-store"; @@ -190,7 +191,7 @@ public void parsesQueries() throws RouterException { mock(ReadOnlyStoreRepository.class), MOCK_ROUTER_CONFIG, compressorFactory, - mock(MetricsRepository.class), + mock(VeniceMetricsRepository.class), mock(ScheduledExecutorService.class)); BasicFullHttpRequest request = new BasicFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.GET, uri, 0, 0); VenicePath path = parser.parseResourceUri(uri, request); @@ -221,7 +222,7 @@ public void parsesB64Uri() throws RouterException { mock(ReadOnlyStoreRepository.class), MOCK_ROUTER_CONFIG, compressorFactory, - mock(MetricsRepository.class), + mock(VeniceMetricsRepository.class), mock(ScheduledExecutorService.class)).parseResourceUri(myUri, request); ByteBuffer partitionKey = path.getPartitionKey().getKeyBuffer(); Assert.assertEquals( @@ -242,7 +243,7 @@ public void failsToParseOtherActions() throws RouterException { mock(ReadOnlyStoreRepository.class), MOCK_ROUTER_CONFIG, compressorFactory, - mock(MetricsRepository.class), + mock(VeniceMetricsRepository.class), mock(ScheduledExecutorService.class)).parseResourceUri("/badAction/storeName/key"); } @@ -289,7 +290,7 @@ public void parseRequestWithBatchSizeViolation() throws RouterException { storeRepository, MOCK_ROUTER_CONFIG, compressorFactory, - mock(MetricsRepository.class), + mock(VeniceMetricsRepository.class), mock(ScheduledExecutorService.class)); try { pathParser.parseResourceUri(myUri, request); diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceResponseAggregator.java b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceResponseAggregator.java index 3075773715..0d01bea6a2 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceResponseAggregator.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceResponseAggregator.java @@ -219,7 +219,7 @@ 
public void testBuildResponseForMultiGet() { FullHttpResponse response5 = buildFullHttpResponse(TOO_MANY_REQUESTS, new byte[0], headers); metrics.setMetric(MetricNames.ROUTER_SERVER_TIME, new TimeValue(1, TimeUnit.MILLISECONDS)); responseAggregator.buildResponse(request, metrics, Collections.singletonList(response5)); - verify(mockStatsForMultiGet).recordThrottledRequest(storeName, 1.0); + verify(mockStatsForMultiGet).recordThrottledRequest(storeName, 1.0, TOO_MANY_REQUESTS); } @Test diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceVersionFinder.java b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceVersionFinder.java index 436ac12ffe..d70afcd8ac 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceVersionFinder.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceVersionFinder.java @@ -28,11 +28,11 @@ import com.linkedin.venice.meta.VersionStatus; import com.linkedin.venice.meta.ZKStore; import com.linkedin.venice.router.stats.StaleVersionStats; +import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.utils.TestUtils; import com.linkedin.venice.utils.Utils; import io.netty.handler.codec.http.HttpMethod; import io.netty.handler.codec.http.HttpVersion; -import io.tehuti.metrics.MetricsRepository; import io.tehuti.metrics.Sensor; import java.nio.ByteBuffer; import java.util.HashMap; @@ -75,7 +75,7 @@ public void throws404onMissingStore() { clusterToD2Map, CLUSTER, compressorFactory, - mock(MetricsRepository.class)); + mock(VeniceMetricsRepository.class)); try { versionFinder.getVersion("", request); Assert.fail( @@ -115,7 +115,7 @@ public void throws301onMigratedStore() { clusterToD2Map, CLUSTER, compressorFactory, - mock(MetricsRepository.class)); + mock(VeniceMetricsRepository.class)); try { request.headers().add(HttpConstants.VENICE_ALLOW_REDIRECT, "1"); versionFinder.getVersion("store", 
request); @@ -150,7 +150,7 @@ public void returnNonExistingVersionOnceStoreIsDisabled() { clusterToD2Map, CLUSTER, compressorFactory, - mock(MetricsRepository.class)); + mock(VeniceMetricsRepository.class)); try { versionFinder.getVersion(storeName, request); Assert.fail("Store should be disabled and forbidden to read."); @@ -190,7 +190,7 @@ public void testSwapsVersionWhenAllPartitionsAreOnline() { HelixReadOnlyStoreConfigRepository storeConfigRepo = mock(HelixReadOnlyStoreConfigRepository.class); CompressorFactory compressorFactory = mock(CompressorFactory.class); - MetricsRepository mockMetricsRepository = mock(MetricsRepository.class); + VeniceMetricsRepository mockMetricsRepository = mock(VeniceMetricsRepository.class); final Sensor mockSensor = mock(Sensor.class); doReturn(mockSensor).when(mockMetricsRepository).sensor(anyString(), any()); @@ -277,7 +277,7 @@ public void returnsCurrentVersionWhenTheDictionaryExists() { clusterToD2Map, CLUSTER, compressorFactory, - mock(MetricsRepository.class)); + mock(VeniceMetricsRepository.class)); String firstVersionKafkaTopic = Version.composeKafkaTopic(storeName, firstVersion); @@ -326,7 +326,7 @@ public void returnsCurrentVersionWhenItIsTheOnlyOption() { clusterToD2Map, CLUSTER, compressorFactory, - mock(MetricsRepository.class)); + mock(VeniceMetricsRepository.class)); String firstVersionKafkaTopic = Version.composeKafkaTopic(storeName, firstVersion); @@ -361,7 +361,7 @@ public void returnsPreviousVersionWhenDictionaryNotDownloaded() { doReturn(true).when(routingDataRepo).containsKafkaTopic(anyString()); CompressorFactory compressorFactory = mock(CompressorFactory.class); - MetricsRepository mockMetricsRepository = mock(MetricsRepository.class); + VeniceMetricsRepository mockMetricsRepository = mock(VeniceMetricsRepository.class); final Sensor mockSensor = mock(Sensor.class); doReturn(mockSensor).when(mockMetricsRepository).sensor(anyString(), any()); @@ -419,7 +419,7 @@ public void 
returnsNewVersionWhenDictionaryDownloads() { doReturn(3).when(routingDataRepo).getNumberOfPartitions(anyString()); doReturn(instances).when(routingDataRepo).getReadyToServeInstances(anyString(), anyInt()); doReturn(true).when(routingDataRepo).containsKafkaTopic(anyString()); - MetricsRepository mockMetricsRepository = mock(MetricsRepository.class); + VeniceMetricsRepository mockMetricsRepository = mock(VeniceMetricsRepository.class); final Sensor mockSensor = mock(Sensor.class); doReturn(mockSensor).when(mockMetricsRepository).sensor(anyString(), any()); diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/api/path/TestVeniceMultiGetPath.java b/services/venice-router/src/test/java/com/linkedin/venice/router/api/path/TestVeniceMultiGetPath.java index 3a0cbc98f0..11f5b0d54b 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/api/path/TestVeniceMultiGetPath.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/api/path/TestVeniceMultiGetPath.java @@ -20,11 +20,11 @@ import com.linkedin.venice.schema.avro.ReadAvroProtocolDefinition; import com.linkedin.venice.serializer.RecordSerializer; import com.linkedin.venice.serializer.SerializerDeserializerFactory; +import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.utils.Utils; import io.netty.buffer.Unpooled; import io.netty.handler.codec.http.HttpMethod; import io.netty.handler.codec.http.HttpVersion; -import io.tehuti.metrics.MetricsRepository; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; @@ -37,14 +37,18 @@ public class TestVeniceMultiGetPath { private final RetryManager disabledRetryManager = - new RetryManager(new MetricsRepository(), "disabled-test-retry-manager", 0, 0, null); + new RetryManager(new VeniceMetricsRepository(), "disabled-test-retry-manager", 0, 0, null); + + public TestVeniceMultiGetPath() { + } @BeforeClass public void setUp() { 
RouterExceptionAndTrackingUtils.setRouterStats( new RouterStats<>( requestType -> new AggRouterHttpRequestStats( - new MetricsRepository(), + "test-cluster", + new VeniceMetricsRepository(), requestType, mock(ReadOnlyStoreRepository.class), true))); diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/api/path/TestVenicePath.java b/services/venice-router/src/test/java/com/linkedin/venice/router/api/path/TestVenicePath.java index da07b2f926..eef224113d 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/api/path/TestVenicePath.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/api/path/TestVenicePath.java @@ -10,12 +10,12 @@ import com.linkedin.venice.read.RequestType; import com.linkedin.venice.router.api.RouterKey; import com.linkedin.venice.schema.avro.ReadAvroProtocolDefinition; +import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.utils.TestMockTime; import com.linkedin.venice.utils.TestUtils; import com.linkedin.venice.utils.Time; import io.netty.handler.codec.http.HttpMethod; import io.netty.handler.codec.http.HttpResponseStatus; -import io.tehuti.metrics.MetricsRepository; import java.time.Clock; import java.util.Collection; import java.util.concurrent.ScheduledExecutorService; @@ -83,13 +83,13 @@ public String getLocation() { } private RetryManager disabledRetryManager; - private MetricsRepository metricsRepository; + private VeniceMetricsRepository metricsRepository; private final ScheduledExecutorService retryManagerScheduler = Executors.newScheduledThreadPool(1); @BeforeMethod public void setUp() { - metricsRepository = new MetricsRepository(); + metricsRepository = new VeniceMetricsRepository(); // retry manager is disabled by default disabledRetryManager = new RetryManager(metricsRepository, "disabled-test-retry-manager", 0, 0, retryManagerScheduler); diff --git 
a/services/venice-router/src/test/java/com/linkedin/venice/router/stats/AdminOperationsStatsTest.java b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/AdminOperationsStatsTest.java index 57e82128ff..f420e0bf25 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/stats/AdminOperationsStatsTest.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/AdminOperationsStatsTest.java @@ -13,7 +13,7 @@ public class AdminOperationsStatsTest { @Test public void testAdminOperationsStats() { - MetricsRepository metrics = MetricsRepositoryUtils.createSingleThreadedMetricsRepository(); + MetricsRepository metrics = MetricsRepositoryUtils.createSingleThreadedVeniceMetricsRepository(); MockTehutiReporter reporter = new MockTehutiReporter(); metrics.addReporter(reporter); VeniceRouterConfig mockConfig = mock(VeniceRouterConfig.class); diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterMetricEntityTest.java b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterMetricEntityTest.java new file mode 100644 index 0000000000..8755ba2aeb --- /dev/null +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterMetricEntityTest.java @@ -0,0 +1,184 @@ +package com.linkedin.venice.router.stats; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertTrue; + +import com.linkedin.venice.router.RouterServer; +import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; +import com.linkedin.venice.stats.metrics.MetricEntity; +import com.linkedin.venice.stats.metrics.MetricType; +import com.linkedin.venice.stats.metrics.MetricUnit; +import com.linkedin.venice.utils.Utils; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import org.testng.annotations.Test; + + +public class RouterMetricEntityTest { + 
@Test + public void testRouterMetricEntities() { + Map expectedMetrics = new HashMap<>(); + + expectedMetrics.put( + RouterMetricEntity.INCOMING_CALL_COUNT, + new MetricEntity( + "incoming_call_count", + MetricType.COUNTER, + MetricUnit.NUMBER, + "Count of all incoming requests", + Utils.setOf( + VeniceMetricsDimensions.VENICE_STORE_NAME, + VeniceMetricsDimensions.VENICE_CLUSTER_NAME, + VeniceMetricsDimensions.VENICE_REQUEST_METHOD))); + expectedMetrics.put( + RouterMetricEntity.CALL_COUNT, + new MetricEntity( + "call_count", + MetricType.COUNTER, + MetricUnit.NUMBER, + "Count of all requests with response details", + Utils.setOf( + VeniceMetricsDimensions.VENICE_STORE_NAME, + VeniceMetricsDimensions.VENICE_CLUSTER_NAME, + VeniceMetricsDimensions.VENICE_REQUEST_METHOD, + VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE, + VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE_CATEGORY, + VeniceMetricsDimensions.VENICE_RESPONSE_STATUS_CODE_CATEGORY))); + expectedMetrics.put( + RouterMetricEntity.CALL_TIME, + new MetricEntity( + "call_time", + MetricType.HISTOGRAM, + MetricUnit.MILLISECOND, + "Latency based on all responses", + Utils.setOf( + VeniceMetricsDimensions.VENICE_STORE_NAME, + VeniceMetricsDimensions.VENICE_CLUSTER_NAME, + VeniceMetricsDimensions.VENICE_REQUEST_METHOD, + VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE_CATEGORY, + VeniceMetricsDimensions.VENICE_RESPONSE_STATUS_CODE_CATEGORY))); + expectedMetrics.put( + RouterMetricEntity.CALL_KEY_COUNT, + new MetricEntity( + "call_key_count", + MetricType.MIN_MAX_COUNT_SUM_AGGREGATIONS, + MetricUnit.NUMBER, + "Count of keys in multi key requests", + Utils.setOf( + VeniceMetricsDimensions.VENICE_STORE_NAME, + VeniceMetricsDimensions.VENICE_CLUSTER_NAME, + VeniceMetricsDimensions.VENICE_REQUEST_METHOD, + VeniceMetricsDimensions.VENICE_REQUEST_VALIDATION_OUTCOME))); + expectedMetrics.put( + RouterMetricEntity.RETRY_COUNT, + new MetricEntity( + "retry_count", + MetricType.COUNTER, + MetricUnit.NUMBER, + "Count of 
retries triggered", + Utils.setOf( + VeniceMetricsDimensions.VENICE_STORE_NAME, + VeniceMetricsDimensions.VENICE_CLUSTER_NAME, + VeniceMetricsDimensions.VENICE_REQUEST_METHOD, + VeniceMetricsDimensions.VENICE_REQUEST_RETRY_TYPE))); + expectedMetrics.put( + RouterMetricEntity.ALLOWED_RETRY_COUNT, + new MetricEntity( + "allowed_retry_count", + MetricType.COUNTER, + MetricUnit.NUMBER, + "Count of allowed retry requests", + Utils.setOf( + VeniceMetricsDimensions.VENICE_STORE_NAME, + VeniceMetricsDimensions.VENICE_CLUSTER_NAME, + VeniceMetricsDimensions.VENICE_REQUEST_METHOD))); + expectedMetrics.put( + RouterMetricEntity.DISALLOWED_RETRY_COUNT, + new MetricEntity( + "disallowed_retry_count", + MetricType.COUNTER, + MetricUnit.NUMBER, + "Count of disallowed retry requests", + Utils.setOf( + VeniceMetricsDimensions.VENICE_STORE_NAME, + VeniceMetricsDimensions.VENICE_CLUSTER_NAME, + VeniceMetricsDimensions.VENICE_REQUEST_METHOD))); + expectedMetrics.put( + RouterMetricEntity.RETRY_DELAY, + new MetricEntity( + "retry_delay", + MetricType.MIN_MAX_COUNT_SUM_AGGREGATIONS, + MetricUnit.MILLISECOND, + "Retry delay time", + Utils.setOf( + VeniceMetricsDimensions.VENICE_STORE_NAME, + VeniceMetricsDimensions.VENICE_CLUSTER_NAME, + VeniceMetricsDimensions.VENICE_REQUEST_METHOD))); + expectedMetrics.put( + RouterMetricEntity.ABORTED_RETRY_COUNT, + new MetricEntity( + "aborted_retry_count", + MetricType.COUNTER, + MetricUnit.NUMBER, + "Count of aborted retry requests", + Utils.setOf( + VeniceMetricsDimensions.VENICE_STORE_NAME, + VeniceMetricsDimensions.VENICE_CLUSTER_NAME, + VeniceMetricsDimensions.VENICE_REQUEST_METHOD, + VeniceMetricsDimensions.VENICE_REQUEST_RETRY_ABORT_REASON))); + + for (RouterMetricEntity metric: RouterMetricEntity.values()) { + MetricEntity actual = metric.getMetricEntity(); + MetricEntity expected = expectedMetrics.get(metric); + + assertNotNull(expected, "No expected definition for " + metric.name()); + assertNotNull(actual.getMetricName(), "Metric name 
should not be null for " + metric.name()); + assertEquals(actual.getMetricName(), expected.getMetricName(), "Unexpected metric name for " + metric.name()); + assertNotNull(actual.getMetricType(), "Metric type should not be null for " + metric.name()); + assertEquals(actual.getMetricType(), expected.getMetricType(), "Unexpected metric type for " + metric.name()); + assertNotNull(actual.getUnit(), "Metric unit should not be null for " + metric.name()); + assertEquals(actual.getUnit(), expected.getUnit(), "Unexpected metric unit for " + metric.name()); + assertNotNull(actual.getDescription(), "Metric description should not be null for " + metric.name()); + assertEquals( + actual.getDescription(), + expected.getDescription(), + "Unexpected metric description for " + metric.name()); + assertNotNull(actual.getDimensionsList(), "Metric dimensions should not be null for " + metric.name()); + assertEquals( + actual.getDimensionsList(), + expected.getDimensionsList(), + "Unexpected metric dimensions for " + metric.name()); + } + + // Convert expectedMetrics to a Collection for comparison + Collection expectedMetricEntities = expectedMetrics.values(); + + // Assert size + assertEquals( + RouterServer.ROUTER_SERVICE_METRIC_ENTITIES.size(), + expectedMetricEntities.size(), + "Unexpected size of ROUTER_SERVICE_METRIC_ENTITIES"); + + // Assert contents + for (MetricEntity actual: RouterServer.ROUTER_SERVICE_METRIC_ENTITIES) { + boolean found = false; + for (MetricEntity expected: expectedMetricEntities) { + if (metricEntitiesEqual(actual, expected)) { + found = true; + break; + } + } + assertTrue(found, "Unexpected MetricEntity found: " + actual.getMetricName()); + } + } + + private boolean metricEntitiesEqual(MetricEntity actual, MetricEntity expected) { + return Objects.equals(actual.getMetricName(), expected.getMetricName()) + && actual.getMetricType() == expected.getMetricType() && actual.getUnit() == expected.getUnit() + && Objects.equals(actual.getDescription(), 
expected.getDescription()) + && Objects.equals(actual.getDimensionsList(), expected.getDimensionsList()); + } +} diff --git a/services/venice-server/src/main/java/com/linkedin/venice/listener/HttpChannelInitializer.java b/services/venice-server/src/main/java/com/linkedin/venice/listener/HttpChannelInitializer.java index 50aa434264..fdca044c42 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/listener/HttpChannelInitializer.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/listener/HttpChannelInitializer.java @@ -83,6 +83,7 @@ public HttpChannelInitializer( boolean isUnregisterMetricForDeletedStoreEnabled = serverConfig.isUnregisterMetricForDeletedStoreEnabled(); this.singleGetStats = new AggServerHttpRequestStats( + serverConfig.getClusterName(), metricsRepository, RequestType.SINGLE_GET, isKeyValueProfilingEnabled, @@ -90,6 +91,7 @@ public HttpChannelInitializer( isUnregisterMetricForDeletedStoreEnabled, isDaVinciClient); this.multiGetStats = new AggServerHttpRequestStats( + serverConfig.getClusterName(), metricsRepository, RequestType.MULTI_GET, isKeyValueProfilingEnabled, @@ -97,6 +99,7 @@ public HttpChannelInitializer( isUnregisterMetricForDeletedStoreEnabled, isDaVinciClient); this.computeStats = new AggServerHttpRequestStats( + serverConfig.getClusterName(), metricsRepository, RequestType.COMPUTE, isKeyValueProfilingEnabled, @@ -127,7 +130,7 @@ public HttpChannelInitializer( if (serverConfig.isQuotaEnforcementEnabled()) { String nodeId = Utils.getHelixNodeIdentifier(serverConfig.getListenerHostname(), serverConfig.getListenerPort()); - this.quotaUsageStats = new AggServerQuotaUsageStats(metricsRepository); + this.quotaUsageStats = new AggServerQuotaUsageStats(serverConfig.getClusterName(), metricsRepository); this.quotaEnforcer = new ReadQuotaEnforcementHandler( serverConfig, storeMetadataRepository, diff --git a/services/venice-server/src/main/java/com/linkedin/venice/server/VeniceServer.java 
b/services/venice-server/src/main/java/com/linkedin/venice/server/VeniceServer.java index 3175f1d75f..d60cce4bdb 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/server/VeniceServer.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/server/VeniceServer.java @@ -326,7 +326,8 @@ private List createServices() { services.add(storageService); // Create stats for RocksDB - storageService.getRocksDBAggregatedStatistics().ifPresent(stat -> new AggRocksDBStats(metricsRepository, stat)); + storageService.getRocksDBAggregatedStatistics() + .ifPresent(stat -> new AggRocksDBStats(serverConfig.getClusterName(), metricsRepository, stat)); compressorFactory = new StorageEngineBackedCompressorFactory(storageMetadataService); diff --git a/services/venice-server/src/main/java/com/linkedin/venice/stats/AggRocksDBStats.java b/services/venice-server/src/main/java/com/linkedin/venice/stats/AggRocksDBStats.java index e9b504a3d4..a75303e2ca 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/stats/AggRocksDBStats.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/stats/AggRocksDBStats.java @@ -8,8 +8,12 @@ * Right now, Venice SN only reports aggregated metrics for RocksDB. 
*/ public class AggRocksDBStats extends AbstractVeniceAggStats { - public AggRocksDBStats(MetricsRepository metricsRepository, Statistics aggStat) { - super(metricsRepository, (metricsRepo, storeName) -> new RocksDBStats(metricsRepository, storeName)); + public AggRocksDBStats(String cluster, MetricsRepository metricsRepository, Statistics aggStat) { + super( + cluster, + metricsRepository, + (metricsRepo, storeName, clusterName) -> new RocksDBStats(metricsRepository, storeName), + false); totalStats.setRocksDBStat(aggStat); } } diff --git a/services/venice-server/src/main/java/com/linkedin/venice/stats/AggServerHttpRequestStats.java b/services/venice-server/src/main/java/com/linkedin/venice/stats/AggServerHttpRequestStats.java index 8305d622d6..d3eadfb5ca 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/stats/AggServerHttpRequestStats.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/stats/AggServerHttpRequestStats.java @@ -12,6 +12,7 @@ */ public class AggServerHttpRequestStats extends AbstractVeniceAggStoreStats { public AggServerHttpRequestStats( + String clusterName, MetricsRepository metricsRepository, RequestType requestType, boolean isKeyValueProfilingEnabled, @@ -19,10 +20,12 @@ public AggServerHttpRequestStats( boolean unregisterMetricForDeletedStoreEnabled, boolean isDaVinciClient) { super( + clusterName, metricsRepository, new ServerHttpRequestStatsSupplier(requestType, isKeyValueProfilingEnabled, isDaVinciClient), metadataRepository, - unregisterMetricForDeletedStoreEnabled); + unregisterMetricForDeletedStoreEnabled, + false); } static class ServerHttpRequestStatsSupplier implements StatsSupplier { @@ -41,7 +44,7 @@ static class ServerHttpRequestStatsSupplier implements StatsSupplier { private static final int SINGLE_VERSION_FOR_TOTAL_STATS = 1; - public AggServerQuotaUsageStats(MetricsRepository metricsRepository) { - super(metricsRepository, (metrics, storeName) -> new ServerReadQuotaUsageStats(metrics, 
storeName)); + public AggServerQuotaUsageStats(String cluster, MetricsRepository metricsRepository) { + super( + cluster, + metricsRepository, + (metrics, storeName, clusterName) -> new ServerReadQuotaUsageStats(metrics, storeName), + false); totalStats.setCurrentVersion(SINGLE_VERSION_FOR_TOTAL_STATS); } diff --git a/services/venice-server/src/test/java/com/linkedin/venice/listener/ReadQuotaEnforcementHandlerTest.java b/services/venice-server/src/test/java/com/linkedin/venice/listener/ReadQuotaEnforcementHandlerTest.java index 1cae4ef4f3..0bd9b7e893 100644 --- a/services/venice-server/src/test/java/com/linkedin/venice/listener/ReadQuotaEnforcementHandlerTest.java +++ b/services/venice-server/src/test/java/com/linkedin/venice/listener/ReadQuotaEnforcementHandlerTest.java @@ -373,7 +373,7 @@ public void testInitWithPreExistingResource() { storeRepository, CompletableFuture.completedFuture(customizedViewRepository), thisNodeId, - new AggServerQuotaUsageStats(metricsRepository), + new AggServerQuotaUsageStats(serverConfig.getClusterName(), metricsRepository), metricsRepository, clock); String storeName = "testStore"; diff --git a/services/venice-server/src/test/java/com/linkedin/venice/stats/AggServerHttpRequestStatsTest.java b/services/venice-server/src/test/java/com/linkedin/venice/stats/AggServerHttpRequestStatsTest.java index b1922b5cb2..d6993e186b 100644 --- a/services/venice-server/src/test/java/com/linkedin/venice/stats/AggServerHttpRequestStatsTest.java +++ b/services/venice-server/src/test/java/com/linkedin/venice/stats/AggServerHttpRequestStatsTest.java @@ -30,6 +30,7 @@ public void setUp() { this.reporter = new MockTehutiReporter(); this.metricsRepository.addReporter(reporter); this.singleGetStats = new AggServerHttpRequestStats( + "test_cluster", metricsRepository, RequestType.SINGLE_GET, false, @@ -37,6 +38,7 @@ public void setUp() { true, false); this.batchGetStats = new AggServerHttpRequestStats( + "test_cluster", metricsRepository, 
RequestType.MULTI_GET, false, diff --git a/services/venice-server/src/test/java/com/linkedin/venice/stats/AggServerReadQuotaUsageStatsTest.java b/services/venice-server/src/test/java/com/linkedin/venice/stats/AggServerReadQuotaUsageStatsTest.java index 8b80452587..3745e200eb 100644 --- a/services/venice-server/src/test/java/com/linkedin/venice/stats/AggServerReadQuotaUsageStatsTest.java +++ b/services/venice-server/src/test/java/com/linkedin/venice/stats/AggServerReadQuotaUsageStatsTest.java @@ -16,7 +16,7 @@ public void testAggServerQuotaUsageStats() { long start = System.currentTimeMillis(); doReturn(start).when(mockTime).milliseconds(); MetricsRepository metricsRepository = new MetricsRepository(); - AggServerQuotaUsageStats aggServerQuotaUsageStats = new AggServerQuotaUsageStats(metricsRepository); + AggServerQuotaUsageStats aggServerQuotaUsageStats = new AggServerQuotaUsageStats("test_cluster", metricsRepository); String storeName = "testStore"; String storeName2 = "testStore2"; String currentReadQuotaRequestedQPSString = "." + storeName + "--current_quota_request.Gauge";