Merge branch 'master' into deprecate-ofn

apache · Nov 27, 2023 · f722348 · f722348
2 parents 5841bff + 173c290
commit f722348
Show file tree

Hide file tree

Showing 9 changed files with 274 additions and 182 deletions.
diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md
@@ -48,6 +48,8 @@
 * Replace Metrics v2 protocol with MQE in UI templates and E2E Test.
 * Fix incorrect apisix metrics otel rules.
 * Support `Scratch The OAP Config Dump`.
+* Group service endpoints into `_abandoned` when endpoints have high
+  cardinality.
 
 #### UI
 
@@ -70,7 +72,11 @@
 * Add Nginx menu i18n.
 * Fix the height for trace widget.
 * Polish list style.
-* Fixes Log associate with Trace.
+* Fix Log association with Trace.
+* Enhance layout for broken Topology widget.
+* Fix calls metric with call type for Topology widget.
+* Fix changing metrics config for Topology widget.
+* Fix routes for Tab widget.
 * Remove OpenFunction(FAAS layer) relative UI templates and menu item.
 
 #### Documentation

diff --git a/...rc/main/java/org/apache/skywalking/oap/server/core/config/group/EndpointNameGrouping.java b/...rc/main/java/org/apache/skywalking/oap/server/core/config/group/EndpointNameGrouping.java
@@ -18,17 +18,18 @@
 
 package org.apache.skywalking.oap.server.core.config.group;
 
-import io.vavr.Tuple2;
 import java.io.IOException;
+import java.time.Duration;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
-import lombok.Setter;
-import lombok.extern.slf4j.Slf4j;
 import org.apache.skywalking.oap.server.ai.pipeline.services.api.HttpUriPattern;
 import org.apache.skywalking.oap.server.ai.pipeline.services.api.HttpUriRecognition;
 import org.apache.skywalking.oap.server.core.config.group.openapi.EndpointGroupingRule4Openapi;
@@ -37,9 +38,17 @@
 import org.apache.skywalking.oap.server.library.util.CollectionUtils;
 import org.apache.skywalking.oap.server.library.util.RunnableWithExceptionProtection;
 import org.apache.skywalking.oap.server.library.util.StringFormatGroup;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.cache.CacheLoader;
+import com.google.common.cache.LoadingCache;
+import io.vavr.Tuple2;
+import lombok.Setter;
+import lombok.extern.slf4j.Slf4j;
 
 @Slf4j
 public class EndpointNameGrouping {
+    public static final String ABANDONED_ENDPOINT_NAME = "_abandoned";
+
     /**
      * Endpoint grouping according to local endpoint-name-grouping.yml or associated dynamic configuration.
      */
@@ -61,7 +70,14 @@ public class EndpointNameGrouping {
      * If the URI is formatted by the rules, the value would be the first 10 formatted names.
      * If the URI is unformatted, the value would be an empty queue.
      */
-    private ConcurrentHashMap<String, ConcurrentHashMap<String, ArrayBlockingQueue<String>>> cachedHttpUris = new ConcurrentHashMap<>();
+    private final Map<String/* service */, Map<String/* uri */, Queue<String>/* candidate patterns */>> cachedHttpUris = new ConcurrentHashMap<>();
+    private final LoadingCache<String/* service */, Set<String>/* unformatted uris */> unformattedHttpUrisCache = 
+        CacheBuilder.newBuilder().expireAfterWrite(Duration.ofMinutes(10)).build(new CacheLoader<>() {
+            @Override
+            public Set<String> load(String service) {
+                return ConcurrentHashMap.newKeySet();
+            }
+        });
     private final AtomicInteger aiPipelineExecutionCounter = new AtomicInteger(0);
     /**
      * The max number of HTTP URIs per service for further URI pattern recognition.
@@ -90,7 +106,7 @@ public Tuple2<String, Boolean> format(String serviceName, String endpointName) {
         if (!formattedName._2() && quickUriGroupingRule != null) {
             formattedName = formatByQuickUriPattern(serviceName, endpointName);
 
-            ConcurrentHashMap<String, ArrayBlockingQueue<String>> svrHttpUris =
+            Map<String, Queue<String>> svrHttpUris =
                 cachedHttpUris.computeIfAbsent(serviceName, k -> new ConcurrentHashMap<>());
 
             // Only cache first N (determined by maxHttpUrisNumberPerService) URIs per 30 mins.
@@ -99,18 +115,29 @@ public Tuple2<String, Boolean> format(String serviceName, String endpointName) {
                     // Algorithm side should not return a pattern that has no {var} in it else this
                     // code may accidentally retrieve the size 1 queue created by unformatted endpoint
                     // The queue size is 10, which means only cache the first 10 formatted names.
-                    final ArrayBlockingQueue<String> formattedURIs = svrHttpUris.computeIfAbsent(
+                    final Queue<String> formattedURIs = svrHttpUris.computeIfAbsent(
                         formattedName._1(), k -> new ArrayBlockingQueue<>(10));
-                    if (formattedURIs.size() < 10) {
-                        // Try to push the raw URI as a candidate of formatted name.
-                        formattedURIs.offer(endpointName);
-                    }
+                    // Try to push the raw URI as a candidate of formatted name.
+                    formattedURIs.offer(endpointName);
                 } else {
                     svrHttpUris.computeIfAbsent(endpointName, k -> new ArrayBlockingQueue<>(1));
                 }
             }
         }
 
+        // Once a service already tracks maxHttpUrisNumberPerService unformatted URIs, report any
+        // unformatted endpoint as ABANDONED_ENDPOINT_NAME to reduce the load on OAP and storage.
+        final var unformattedUrisOfService = unformattedHttpUrisCache.getUnchecked(serviceName);
+        if (!formattedName._2()) {
+            if (unformattedUrisOfService.size() < maxHttpUrisNumberPerService) {
+                unformattedUrisOfService.add(endpointName);
+            } else {
+                formattedName = new Tuple2<>(ABANDONED_ENDPOINT_NAME, true);
+            }
+        } else {
+            unformattedUrisOfService.remove(endpointName);
+        }
+
         return formattedName;
     }
 
@@ -204,8 +231,6 @@ public void startHttpUriRecognitionSvr(final HttpUriRecognition httpUriRecogniti
                                              final List<HttpUriPattern> patterns
                                                  = httpUriRecognitionSvr.fetchAllPatterns(service.getName());
                                              if (CollectionUtils.isNotEmpty(patterns)) {
-                                                 StringFormatGroup group = new StringFormatGroup(
-                                                     patterns.size());
                                                  patterns.forEach(
                                                      p -> quickUriGroupingRule.addRule(
                                                          service.getName(), p.getPattern()));

diff --git a/.../server-starter/src/main/resources/ui-initialized-templates/general/general-endpoint.json b/.../server-starter/src/main/resources/ui-initialized-templates/general/general-endpoint.json
@@ -287,47 +287,67 @@
                   "linkDashboard": "General-Endpoint-Relation",
                   "nodeDashboard": "General-Endpoint",
                   "linkServerMetrics": [
-                    "endpoint_relation_sla",
-                    "endpoint_relation_resp_time",
-                    "endpoint_relation_cpm"
+
+                  ],
+                  "linkClientMetrics": [
+
                   ],
-                  "linkClientMetrics": [],
                   "nodeMetrics": [
-                    "endpoint_sla",
-                    "endpoint_resp_time",
-                    "endpoint_cpm"
+
                   ],
-                  "legend": [],
-                  "description": {},
+                  "legend": [
+
+                  ],
+                  "description": {
+
+                  },
                   "linkServerMetricConfig": [
                     {
-                      "unit": "%",
                       "label": "Success Rate",
-                      "calculation": "percentage"
+                      "unit": "%"
                     },
                     {
-                      "unit": "ms",
-                      "label": "Latency"
+                      "label": "Latency",
+                      "unit": "ms"
                     },
                     {
-                      "unit": "calls / min",
-                      "label": "Load"
+                      "label": "Load",
+                      "unit": "calls / min"
                     }
                   ],
                   "nodeMetricConfig": [
                     {
-                      "unit": "%",
                       "label": "Success Rate",
-                      "calculation": "percentage"
+                      "unit": "%"
                     },
                     {
-                      "unit": "ms",
-                      "label": "Latency"
+                      "label": "Latency",
+                      "unit": "ms"
                     },
                     {
-                      "unit": "calls / min ",
-                      "label": "Load"
+                      "label": "Load",
+                      "unit": "calls / min"
                     }
+                  ],
+                  "linkServerExpressions": [
+                    "avg(endpoint_relation_sla/100)",
+                    "avg(endpoint_relation_resp_time)",
+                    "avg(endpoint_relation_cpm)"
+                  ],
+                  "linkClientExpressions": [
+
+                  ],
+                  "nodeExpressions": [
+                    "avg(endpoint_sla/100)",
+                    "avg(endpoint_resp_time)",
+                    "avg(endpoint_cpm)"
+                  ],
+                  "metricMode": "Expression",
+                  "legendMQE": {
+                    "expression": ""
+                  },
+                  "linkClientMetricConfig": [
+
                   ]
                 }
               ]

diff --git a/...rver/server-starter/src/main/resources/ui-initialized-templates/general/general-root.json b/...rver/server-starter/src/main/resources/ui-initialized-templates/general/general-root.json
@@ -108,69 +108,72 @@
                     }
                   ],
                   "linkServerMetrics": [
-                    "service_relation_server_resp_time",
-                    "service_relation_server_cpm"
+
                   ],
                   "linkClientMetrics": [
-                    "service_relation_client_cpm",
-                    "service_relation_client_resp_time"
+
                   ],
                   "nodeMetrics": [
-                    "service_cpm",
-                    "service_sla",
-                    "service_resp_time"
+
                   ],
                   "legend": [
-                    {
-                      "name": "service_sla",
-                      "condition": "<",
-                      "value": "9500"
-                    },
-                    {
-                      "name": "service_cpm",
-                      "condition": ">",
-                      "value": "1"
-                    }
+
                   ],
                   "description": {
                     "healthy": "Healthy",
                     "unhealthy": "Success Rate < 95% and Traffic > 1 calls / min"
                   },
                   "nodeMetricConfig": [
                     {
-                      "unit": "calls / min",
-                      "label": "Load"
+                      "label": "Load",
+                      "unit": "calls / min"
                     },
                     {
-                      "calculation": "percentage",
                       "unit": "%",
                       "label": "Success Rate"
                     },
                     {
-                      "unit": "ms",
-                      "label": "Latency"
+                      "label": "Latency",
+                      "unit": "ms"
                     }
                   ],
                   "linkServerMetricConfig": [
                     {
-                      "unit": "ms",
-                      "label": "Server Latency"
+                      "label": "Server Latency",
+                      "unit": "ms"
                     },
                     {
-                      "unit": "calls / min",
-                      "label": "Server Load"
+                      "label": "Server Load",
+                      "unit": "calls / min"
                     }
                   ],
                   "linkClientMetricConfig": [
                     {
-                      "unit": "calls / min",
-                      "label": "Client Load"
+                      "label": "Client Latency",
+                      "unit": "ms"
                     },
                     {
-                      "unit": "ms",
-                      "label": "Client Latency"
+                      "label": "Client Load",
+                      "unit": "calls / min"
                     }
-                  ]
+                  ],
+                  "linkServerExpressions": [
+                    "avg(service_relation_server_resp_time)",
+                    "avg(service_relation_server_cpm)"
+                  ],
+                  "linkClientExpressions": [
+                    "avg(service_relation_client_resp_time)",
+                    "avg(service_relation_client_cpm)"
+                  ],
+                  "nodeExpressions": [
+                    "avg(service_cpm)",
+                    "avg(service_sla/100)",
+                    "avg(service_resp_time)"
+                  ],
+                  "metricMode": "Expression",
+                  "legendMQE": {
+                    "expression": "avg(service_sla < 9500) and avg(service_cpm) > 1"
+                  }
                 }
               ]
             },