Skip to content

Commit

Permalink
Merge branch 'master' into feature/20231013_zgc_support
Browse files Browse the repository at this point in the history
  • Loading branch information
Z-Beatles authored Oct 20, 2023
2 parents acaf256 + 062b31f commit 07264ef
Show file tree
Hide file tree
Showing 10 changed files with 170 additions and 38 deletions.
1 change: 1 addition & 0 deletions docs/en/changes/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
* Fix `AlarmCore` doAlarm: catch exception for each callback to avoid interruption.
* Optimize queryBasicTraces in TraceQueryEsDAO.
* Fix `WebhookCallback` send incorrect messages, add catch exception for each callback HTTP Post.
* Fix AlarmRule expression validation: add labeled metrics mock data for check.
* Support collect ZGC memory pool metrics.

#### UI
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import lombok.AccessLevel;
import lombok.Data;
Expand All @@ -39,7 +38,6 @@
import org.apache.skywalking.mqe.rt.type.ExpressionResult;
import org.apache.skywalking.mqe.rt.type.ExpressionResultType;
import org.apache.skywalking.oap.server.core.alarm.provider.expr.rt.AlarmMQEVerifyVisitor;
import org.apache.skywalking.oap.server.core.storage.annotation.Column;
import org.apache.skywalking.oap.server.core.storage.annotation.ValueColumnMetadata;
import org.apache.skywalking.oap.server.library.util.StringUtil;

Expand Down Expand Up @@ -95,25 +93,11 @@ public void setExpression(final String expression) throws IllegalExpressionExcep
private void verifyIncludeMetrics(Set<String> includeMetrics, String expression) throws IllegalExpressionException {
Set<String> scopeSet = new HashSet<>();
for (String metricName : includeMetrics) {
Optional<ValueColumnMetadata.ValueColumn> valueColumn = ValueColumnMetadata.INSTANCE.readValueColumnDefinition(
metricName);
if (valueColumn.isEmpty()) {
throw new IllegalExpressionException("Metric: [" + metricName + "] dose not exist.");
}
scopeSet.add(ValueColumnMetadata.INSTANCE.getScope(metricName).name());
Column.ValueDataType dataType = valueColumn.get().getDataType();
switch (dataType) {
case COMMON_VALUE:
case LABELED_VALUE:
break;
default:
throw new IllegalExpressionException(
"Metric dose not supported in alarm, metric: [" + metricName + "] is not a common or labeled metric.");
}
}
if (scopeSet.size() != 1) {
throw new IllegalExpressionException(
"The metrics in expression: " + expression + " must have the same scope level, but got: " + scopeSet);
"The metrics in expression: " + expression + " must have the same scope level, but got: " + scopeSet + ".");
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@

package org.apache.skywalking.oap.server.core.alarm.provider.expr.rt;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
Expand All @@ -28,6 +33,13 @@
import org.apache.skywalking.mqe.rt.type.ExpressionResultType;
import org.apache.skywalking.mqe.rt.type.MQEValue;
import org.apache.skywalking.mqe.rt.type.MQEValues;
import org.apache.skywalking.mqe.rt.type.Metadata;
import org.apache.skywalking.oap.server.core.Const;
import org.apache.skywalking.oap.server.core.query.type.KeyValue;
import org.apache.skywalking.oap.server.core.storage.annotation.Column;
import org.apache.skywalking.oap.server.core.storage.annotation.ValueColumnMetadata;
import org.apache.skywalking.oap.server.library.util.CollectionUtils;
import org.apache.skywalking.oap.server.library.util.StringUtil;

/**
* Used for verify the alarm expression and get the metrics name when read the alarm rules.
Expand All @@ -41,20 +53,64 @@ public class AlarmMQEVerifyVisitor extends MQEVisitorBase {
public ExpressionResult visitMetric(MQEParser.MetricContext ctx) {
ExpressionResult result = new ExpressionResult();
String metricName = ctx.metricName().getText();
Optional<ValueColumnMetadata.ValueColumn> valueColumn = ValueColumnMetadata.INSTANCE.readValueColumnDefinition(
metricName);
if (valueColumn.isEmpty()) {
result.setType(ExpressionResultType.UNKNOWN);
result.setError("Metric: [" + metricName + "] dose not exist.");
return result;
}

this.includeMetrics.add(metricName);

if (ctx.parent instanceof MQEParser.TopNOPContext) {
result.setType(ExpressionResultType.UNKNOWN);
result.setError("Unsupported operation: [top_n] in alarm expression.");
return result;
}
Column.ValueDataType dataType = valueColumn.get().getDataType();

MQEValues mqeValues = new MQEValues();
MQEValues mockMqeValues = new MQEValues();
MQEValue mqeValue = new MQEValue();
mqeValue.setEmptyValue(true);
mqeValues.getValues().add(mqeValue);
result.getResults().add(mqeValues);
result.setType(ExpressionResultType.TIME_SERIES_VALUES);
return result;
mockMqeValues.getValues().add(mqeValue);
if (dataType == Column.ValueDataType.COMMON_VALUE) {
result.getResults().add(mockMqeValues);
result.setType(ExpressionResultType.TIME_SERIES_VALUES);
return result;
} else if (dataType == Column.ValueDataType.LABELED_VALUE) {
List<String> labelValues = Collections.emptyList();
if (ctx.label() != null) {
String labelValue = ctx.label().labelValue().getText();
String labelValueTrim = labelValue.substring(1, labelValue.length() - 1);
if (StringUtil.isNotBlank(labelValueTrim)) {
labelValues = Arrays.asList(labelValueTrim.split(Const.COMMA));
}
}
ArrayList<MQEValues> mqeValuesList = new ArrayList<>();
if (CollectionUtils.isEmpty(labelValues)) {
KeyValue label = new KeyValue(GENERAL_LABEL_NAME, GENERAL_LABEL_NAME);
Metadata metadata = new Metadata();
metadata.getLabels().add(label);
mockMqeValues.setMetric(metadata);
mqeValuesList.add(mockMqeValues);
} else {
for (String value : labelValues) {
Metadata metadata = new Metadata();
KeyValue label = new KeyValue(GENERAL_LABEL_NAME, value);
metadata.getLabels().add(label);
mockMqeValues.setMetric(metadata);
mqeValuesList.add(mockMqeValues);
}
}
result.setType(ExpressionResultType.TIME_SERIES_VALUES);
result.setResults(mqeValuesList);
result.setLabeledResult(true);
return result;
} else {
result.setType(ExpressionResultType.UNKNOWN);
result.setError("Metric dose not supported in alarm, metric: [" + metricName + "] is not a common or labeled metric.");
return result;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ public void setUp() throws NoSuchFieldException, IllegalAccessException {
"endpoint_percent", "testColumn", Column.ValueDataType.COMMON_VALUE, Function.Avg, 0,
Scope.Endpoint.getScopeId()
);
ValueColumnMetadata.INSTANCE.putIfAbsent(
"meter_status_code", "testColumn", Column.ValueDataType.LABELED_VALUE, Function.Avg, 0,
Scope.Service.getScopeId()
);
ValueColumnMetadata.INSTANCE.putIfAbsent(
"record", "testColumn", Column.ValueDataType.SAMPLED_RECORD, Function.Avg, 0,
Scope.Endpoint.getScopeId()
Expand All @@ -58,17 +62,23 @@ public void setUp() throws NoSuchFieldException, IllegalAccessException {
@Test
public void testExpressionVerify() throws IllegalExpressionException {
AlarmRule rule = new AlarmRule();
//normal
//normal common metric
rule.setExpression("sum(service_percent < 85) >= 3");

//normal labeled metric
//4xx + 5xx > 10
rule.setExpression("sum(aggregate_labels(meter_status_code{_='4xx,5xx'},sum) > 10) > 3");
rule.setExpression("sum(aggregate_labels(meter_status_code,sum) > 10) > 3");
//4xx or 5xx > 10
rule.setExpression("sum(meter_status_code{_='4xx,5xx'} > 10) >= 3");
rule.setExpression("sum(meter_status_code > 10) >= 3");
//illegal expression
Assertions.assertThrows(IllegalExpressionException.class, () -> {
rule.setExpression("what? sum(service_percent < 85) >= 3");
});

//not exist metric
Assertions.assertEquals(
"Metric: [service_percent111] dose not exist.",
"Expression: sum(service_percent111 < 85) >= 3 error: Metric: [service_percent111] dose not exist.",
Assertions.assertThrows(IllegalExpressionException.class, () -> {
rule.setExpression("sum(service_percent111 < 85) >= 3");
}).getMessage()
Expand All @@ -92,7 +102,7 @@ public void testExpressionVerify() throws IllegalExpressionException {

//not a common or labeled metric
Assertions.assertEquals(
"Metric dose not supported in alarm, metric: [record] is not a common or labeled metric.",
"Expression: sum(record < 85) > 1 error: Metric dose not supported in alarm, metric: [record] is not a common or labeled metric.",
Assertions.assertThrows(IllegalExpressionException.class, () -> {
rule.setExpression("sum(record < 85) > 1");
}).getMessage()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

package org.apache.skywalking.oap.server.core.profiling.ebpf.storage;

import com.google.common.base.Charsets;
import com.google.common.hash.Hashing;
import com.linecorp.armeria.internal.shaded.guava.base.Charsets;
import org.apache.skywalking.oap.server.core.analysis.SourceDispatcher;
import org.apache.skywalking.oap.server.core.analysis.TimeBucket;
import org.apache.skywalking.oap.server.core.analysis.worker.MetricsStreamProcessor;
Expand All @@ -34,9 +34,15 @@ public void dispatch(EBPFProcessProfilingSchedule source) {
traffic.setStartTime(source.getStartTime());
traffic.setEndTime(source.getCurrentTime());
traffic.setTimeBucket(TimeBucket.getMinuteTimeBucket(source.getCurrentTime()));
traffic.setScheduleId(Hashing.sha256().newHasher()
.putString(String.format("%s_%s_%d", source.getTaskId(), source.getProcessId(), source.getStartTime()), Charsets.UTF_8)
.hash().toString());
traffic.setScheduleId(
Hashing
.sha256()
.newHasher()
.putString(
String.format("%s_%s_%d", source.getTaskId(), source.getProcessId(), source.getStartTime()),
Charsets.UTF_8
)
.hash().toString());
MetricsStreamProcessor.getInstance().in(traffic);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

package org.apache.skywalking.oap.server.core.profiling.ebpf.storage;

import com.google.common.base.Charsets;
import com.google.common.hash.Hashing;
import com.linecorp.armeria.internal.shaded.guava.base.Charsets;
import lombok.Data;
import org.apache.skywalking.oap.server.core.analysis.Stream;
import org.apache.skywalking.oap.server.core.analysis.record.Record;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

package org.apache.skywalking.oap.server.core.source;

import com.google.common.base.Charsets;
import com.google.common.hash.Hashing;
import com.linecorp.armeria.internal.shaded.guava.base.Charsets;
import lombok.Getter;
import lombok.Setter;

Expand All @@ -41,8 +41,8 @@ public int scope() {
public String getEntityId() {
if (entityId == null) {
entityId = Hashing.sha256().newHasher()
.putString(String.format("%s_%s_%d", taskId, processId, startTime), Charsets.UTF_8)
.hash().toString();
.putString(String.format("%s_%s_%d", taskId, processId, startTime), Charsets.UTF_8)
.hash().toString();
}
return entityId;
}
Expand Down
11 changes: 7 additions & 4 deletions test/e2e-v2/cases/alarm/alarm-settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,15 @@ rules:
receivers: lisi
hooks:
- webhook.custom
# service sla > 1%
service_sla_rule:
expression: sum(service_sla > 100) >= 1
# service_percentile > 10ms
service_percentile_rule:
expression: sum(service_percentile{_='0,1,2,3,4'} > 10) >= 3
period: 10
silence-period: 1
message: Successful rate of service {name} is more than 1% in 1 minutes of last 10 minutes
message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10.
tags:
level: WARNING
receivers: lisi
hooks:
- webhook.none
comp_rule:
Expand Down
48 changes: 48 additions & 0 deletions test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,30 @@

msgs:
{{- contains .msgs }}
- starttime: {{ gt .starttime 0 }}
scope: Service
id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10.
tags:
- key: level
value: WARNING
- key: receivers
value: lisi
events:
{{- contains .events }}
- uuid: {{ notEmpty .uuid }}
source:
service: e2e-service-provider
serviceinstance: ""
endpoint: ""
name: Alarm
type: ""
message: {{ notEmpty .message }}
parameters: []
starttime: {{ gt .starttime 0 }}
endtime: {{ gt .endtime 0 }}
layer: GENERAL
{{- end }}
- starttime: {{ gt .starttime 0 }}
scope: Service
id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
Expand All @@ -39,6 +63,30 @@ msgs:
endtime: {{ gt .endtime 0 }}
layer: GENERAL
{{- end }}
- starttime: {{ gt .starttime 0 }}
scope: Service
id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10.
tags:
- key: level
value: WARNING
- key: receivers
value: lisi
events:
{{- contains .events }}
- uuid: {{ notEmpty .uuid }}
source:
service: e2e-service-provider
serviceinstance: ""
endpoint: ""
name: Alarm
type: ""
message: {{ notEmpty .message }}
parameters: []
starttime: {{ gt .starttime 0 }}
endtime: {{ gt .endtime 0 }}
layer: GENERAL
{{- end }}
- starttime: {{ gt .starttime 0 }}
scope: Service
id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
Expand Down
24 changes: 24 additions & 0 deletions test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,30 @@

msgs:
{{- contains .msgs }}
- starttime: {{ gt .starttime 0 }}
scope: Service
id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10.
tags:
- key: level
value: WARNING
- key: receivers
value: lisi
events:
{{- contains .events }}
- uuid: {{ notEmpty .uuid }}
source:
service: e2e-service-provider
serviceinstance: ""
endpoint: ""
name: Alarm
type: ""
message: {{ notEmpty .message }}
parameters: []
starttime: {{ gt .starttime 0 }}
endtime: {{ gt .endtime 0 }}
layer: GENERAL
{{- end }}
- starttime: {{ gt .starttime 0 }}
scope: Service
id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1
Expand Down

0 comments on commit 07264ef

Please sign in to comment.