Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: assert metrics in network tests #10215

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions yarn-project/end-to-end/scripts/e2e_test_with_alerts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,8 @@ docker run \
-e METRICS_PORT="4318" \
-e COLLECT_METRICS="true" \
-e PULL_REQUEST="$PULL_REQUEST" \
-e CHECK_ALERTS="true" \
$env_args \
--rm aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG \
"$test_path" "$@" || [ "$ignore_failures" = "true" ]


echo "Running alert checker..."
docker run --network host --rm aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG quality_of_service/alert_checker.test.ts
10 changes: 8 additions & 2 deletions yarn-project/end-to-end/scripts/network_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,15 @@ helm upgrade --install spartan "$REPO/spartan/aztec-network/" \

kubectl wait pod -l app==pxe --for=condition=Ready -n "$NAMESPACE" --timeout=10m

# Find two free ports between 9000 and 10000
FREE_PORTS=$(comm -23 <(seq 9000 10000 | sort) <(ss -Htan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n 2)
# Find 3 free ports between 9000 and 10000
FREE_PORTS=$(comm -23 <(seq 9000 10000 | sort) <(ss -Htan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n 3)

# Extract the three free ports from the list
PXE_PORT=$(echo $FREE_PORTS | awk '{print $1}')
ANVIL_PORT=$(echo $FREE_PORTS | awk '{print $2}')
METRICS_PORT=$(echo $FREE_PORTS | awk '{print $3}')

GRAFANA_PASSWORD=$(kubectl get secrets -n metrics metrics-grafana -o jsonpath='{.data.admin-password}' | base64 --decode)

# Namespace variable (assuming it's set)
NAMESPACE=${NAMESPACE:-default}
Expand All @@ -170,6 +173,9 @@ docker run --rm --network=host \
-e CONTAINER_PXE_PORT=8081 \
-e HOST_ETHEREUM_PORT=$ANVIL_PORT \
-e CONTAINER_ETHEREUM_PORT=8545 \
-e HOST_METRICS_PORT=$METRICS_PORT \
-e CONTAINER_METRICS_PORT=80 \
-e GRAFANA_PASSWORD=$GRAFANA_PASSWORD \
-e DEBUG="aztec:*" \
-e LOG_JSON=1 \
-e LOG_LEVEL=debug \
Expand Down
20 changes: 20 additions & 0 deletions yarn-project/end-to-end/src/e2e_p2p/gossip_network.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,29 @@ import fs from 'fs';

import { shouldCollectMetrics } from '../fixtures/fixtures.js';
import { type NodeContext, createNodes } from '../fixtures/setup_p2p_test.js';
import { AlertChecker, type AlertConfig } from '../quality_of_service/alert_checker.js';
import { P2PNetworkTest, WAIT_FOR_TX_TIMEOUT } from './p2p_network.js';
import { createPXEServiceAndSubmitTransactions } from './shared.js';

// Opt-in switch for the QoS alert check: enabled only when the CHECK_ALERTS env var is exactly the string 'true'.
const CHECK_ALERTS = process.env.CHECK_ALERTS === 'true';

// Don't set this to a higher value than 9 because each node will use a different L1 publisher account and anvil seeds
const NUM_NODES = 4;
const NUM_TXS_PER_NODE = 2;
const BOOT_NODE_UDP_PORT = 40600;

const DATA_DIR = './data/gossip';

// Quality-of-service alerts evaluated once the suite has finished (only when CHECK_ALERTS is enabled).
// The single rule here flags runs where aztec_sequencer_time_to_collect_attestations exceeds 2500
// (presumably milliseconds — TODO confirm the metric's unit).
const qosAlerts: AlertConfig[] = [
{
alert: 'SequencerTimeToCollectAttestations',
// Raw instant value of the metric compared against the threshold (no averaging window).
expr: 'aztec_sequencer_time_to_collect_attestations > 2500',
labels: { severity: 'error' },
// NOTE(review): the alert checker only evaluates `expr`; confirm whether `for` is meant to have any effect here.
for: '10m',
annotations: {},
},
];

describe('e2e_p2p_network', () => {
let t: P2PNetworkTest;
let nodes: AztecNodeService[];
Expand All @@ -39,6 +52,13 @@ describe('e2e_p2p_network', () => {
}
});

afterAll(async () => {
if (CHECK_ALERTS) {
const checker = new AlertChecker(t.logger);
await checker.runAlertCheck(qosAlerts);
}
});

it('should rollup txs from all peers', async () => {
// create the bootstrap node for the network
if (!t.bootstrapNodeEnr) {
Expand Down

This file was deleted.

105 changes: 105 additions & 0 deletions yarn-project/end-to-end/src/quality_of_service/alert_checker.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import { type DebugLogger } from '@aztec/aztec.js';

import * as fs from 'fs';
import * as yaml from 'js-yaml';

/**
 * Description of a single alert rule to evaluate.
 * NOTE(review): the field names look like a Prometheus alerting rule entry — confirm that is the intended schema.
 */
export interface AlertConfig {
/** Name of the alert; included in the log messages reporting whether it triggered or passed. */
alert: string;
/** Query expression sent to Grafana. The alert counts as triggered when the returned value is greater than zero. */
expr: string;
/** Presumably a duration such as '10m'. Not read by AlertChecker apart from being logged with the rest of the alert. */
for: string;
/** Free-form string labels (e.g. a severity). Not read by AlertChecker apart from being logged. */
labels: Record<string, string>;
/** Free-form string annotations. Not read by AlertChecker apart from being logged. */
annotations: Record<string, string>;
}

/** Connection settings used by AlertChecker to reach Grafana. */
export interface AlertCheckerConfig {
/** Full URL of the query endpoint; the checker appends `?query=<url-encoded expr>` to it. */
grafanaEndpoint: string;
/** Credentials that are base64-encoded and sent as an HTTP Basic `Authorization` header (e.g. 'user:password'). */
grafanaCredentials: string;
}

// Defaults applied by AlertChecker for any setting the caller does not override.
// This config is good if you're running the otel-lgtm stack locally
const DEFAULT_CONFIG: AlertCheckerConfig = {
// Prometheus instant-query API reached through Grafana's datasource proxy on localhost:3000.
grafanaEndpoint: 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1/query',
// NOTE(review): presumably the stock admin login of a local Grafana — override for any shared deployment.
grafanaCredentials: 'admin:admin',
};

/**
 * Evaluates alert expressions against Prometheus (through a Grafana datasource proxy) and fails
 * when any of them fires.
 *
 * Each alert's `expr` is sent as an instant query. An alert is considered triggered when the query
 * returns at least one sample whose value is greater than zero.
 */
export class AlertChecker {
  private readonly config: AlertCheckerConfig;
  private readonly logger: DebugLogger;

  /**
   * @param logger - Logger used to report the outcome of every alert.
   * @param config - Optional overrides; any setting left out falls back to DEFAULT_CONFIG.
   */
  constructor(logger: DebugLogger, config: Partial<AlertCheckerConfig> = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
    this.logger = logger;
  }

  /**
   * Load the alerts config from a file path.
   * @param filePath - The absolute path to the alerts file.
   * @returns The list found under the top-level `alerts` key.
   * @throws Error if the file cannot be read or does not contain a top-level `alerts` list.
   */
  private loadAlertsConfig(filePath: string): AlertConfig[] {
    const fileContents = fs.readFileSync(filePath, 'utf8');
    // NOTE(review): with js-yaml versions before 4, `load` may instantiate arbitrary JS types;
    // only pass trusted, repo-local files to this method.
    // `load` yields undefined for an empty document, so the shape is checked before use.
    const data = yaml.load(fileContents) as { alerts?: unknown } | null | undefined;
    const alerts = data?.alerts;
    if (!Array.isArray(alerts)) {
      throw new Error(`Alerts file ${filePath} does not contain a top-level "alerts" list`);
    }
    return alerts as AlertConfig[];
  }

  /**
   * Convert a Prometheus sample string into a number.
   * Prometheus encodes infinities as "+Inf"/"-Inf", which `parseFloat` would turn into NaN.
   */
  private static parseSampleValue(raw: string): number {
    switch (raw) {
      case '+Inf':
        return Number.POSITIVE_INFINITY;
      case '-Inf':
        return Number.NEGATIVE_INFINITY;
      default:
        return parseFloat(raw);
    }
  }

  /**
   * Run an instant query and reduce its result to a single number.
   * @param expr - The query expression to evaluate.
   * @returns The largest sample value across all returned series, or 0 when no usable sample is returned.
   * @throws Error if the HTTP request fails or the response does not have the expected shape.
   */
  private async queryGrafana(expr: string): Promise<number> {
    const credentials = Buffer.from(this.config.grafanaCredentials).toString('base64');

    const response = await fetch(`${this.config.grafanaEndpoint}?query=${encodeURIComponent(expr)}`, {
      headers: {
        Authorization: `Basic ${credentials}`,
      },
    });

    if (!response.ok) {
      throw new Error(`Failed to fetch data from Grafana: ${response.statusText}`);
    }

    const body = (await response.json()) as { data?: { result?: unknown } } | null;
    const result = body?.data?.result;
    if (!Array.isArray(result)) {
      throw new Error(`Unexpected response from Grafana for query "${expr}": missing data.result`);
    }

    const values = result
      .map(series => {
        const sample: unknown = (series as { value?: unknown } | null)?.value;
        if (!Array.isArray(sample) || sample.length < 2) {
          throw new Error(`Unexpected response from Grafana for query "${expr}": series without an instant value`);
        }
        return AlertChecker.parseSampleValue(String(sample[1]));
      })
      // NaN samples can never satisfy `> 0`, and would poison Math.max, so they are dropped.
      .filter(value => !Number.isNaN(value));

    // Look at every series, not just the first one, so a firing series is never masked by a quiet one.
    return values.length > 0 ? Math.max(...values) : 0;
  }

  /**
   * Evaluate every alert in order, logging the outcome of each one.
   * @param alerts - The alerts to check.
   * @throws Error after all alerts have been evaluated if at least one of them triggered.
   */
  private async checkAlerts(alerts: AlertConfig[]): Promise<void> {
    let alertTriggered = false;

    // Evaluated sequentially so the log lines of each alert stay grouped together.
    for (const alert of alerts) {
      this.logger.info(`Checking alert: ${JSON.stringify(alert)}`);

      const metricValue = await this.queryGrafana(alert.expr);
      this.logger.info(`Metric value: ${metricValue}`);
      if (metricValue > 0) {
        this.logger.error(`Alert ${alert.alert} triggered! Value: ${metricValue}`);
        alertTriggered = true;
      } else {
        this.logger.info(`Alert ${alert.alert} passed.`);
      }
    }

    if (alertTriggered) {
      throw new Error('Test failed due to triggered alert');
    }
  }

  /**
   * Run the alert check based on the alerts defined in an array.
   * @param alerts - The alerts to check.
   * @throws Rethrows any error raised while checking (after logging it), including a triggered alert.
   */
  public async runAlertCheck(alerts: AlertConfig[]): Promise<void> {
    try {
      await this.checkAlerts(alerts);
      this.logger.info('All alerts passed.');
    } catch (error) {
      this.logger.error(error instanceof Error ? error.message : String(error));
      throw error;
    }
  }

  /**
   * Run the alert check based on the alerts defined in a yaml file.
   * @param filePath - The absolute path to the alerts file.
   * @throws Error if the file cannot be loaded, or any error raised by the alert check itself.
   */
  public async runAlertCheckFromFilePath(filePath: string): Promise<void> {
    const alerts = this.loadAlertsConfig(filePath);
    // Delegate to runAlertCheck so both entry points log success and failure the same way.
    await this.runAlertCheck(alerts);
  }
}
10 changes: 0 additions & 10 deletions yarn-project/end-to-end/src/quality_of_service/alerts.yaml

This file was deleted.

23 changes: 23 additions & 0 deletions yarn-project/end-to-end/src/spartan/gating-passive.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { createDebugLogger } from '@aztec/foundation/log';
import { expect, jest } from '@jest/globals';

import { RollupCheatCodes } from '../../../aztec.js/src/utils/cheat_codes.js';
import { type AlertConfig } from '../quality_of_service/alert_checker.js';
import {
applyBootNodeFailure,
applyNetworkShaping,
Expand All @@ -13,9 +14,20 @@ import {
getConfig,
isK8sConfig,
restartBot,
runAlertCheck,
startPortForward,
} from './utils.js';

// Quality-of-service alerts passed to runAlertCheck after the suite has finished.
// The single rule here flags runs where the 2-minute average of
// aztec_sequencer_time_to_collect_attestations exceeds 2500 (presumably milliseconds — TODO confirm the unit).
const qosAlerts: AlertConfig[] = [
{
alert: 'SequencerTimeToCollectAttestations',
// Averaged over a 2-minute window so a single slow sample does not trip the alert on its own.
expr: 'avg_over_time(aztec_sequencer_time_to_collect_attestations[2m]) > 2500',
labels: { severity: 'error' },
// NOTE(review): the alert checker only evaluates `expr`; confirm whether `for` is meant to have any effect here.
for: '10m',
annotations: {},
},
];

const config = getConfig(process.env);
if (!isK8sConfig(config)) {
throw new Error('This test must be run in a k8s environment');
Expand All @@ -39,6 +51,10 @@ describe('a test that passively observes the network in the presence of network
// 50% is the max that we expect to miss
const MAX_MISSED_SLOT_PERCENT = 0.5;

afterAll(async () => {
await runAlertCheck(config, qosAlerts, debugLogger);
});

it('survives network chaos', async () => {
await startPortForward({
resource: `svc/${config.INSTANCE_NAME}-aztec-network-pxe`,
Expand All @@ -52,6 +68,13 @@ describe('a test that passively observes the network in the presence of network
containerPort: CONTAINER_ETHEREUM_PORT,
hostPort: HOST_ETHEREUM_PORT,
});

await startPortForward({
resource: `svc/metrics-grafana`,
namespace: 'metrics',
containerPort: config.CONTAINER_METRICS_PORT,
hostPort: config.HOST_METRICS_PORT,
});
const client = await createCompatibleClient(PXE_URL, debugLogger);
const ethCheatCodes = new EthCheatCodes(ETHEREUM_HOST);
const rollupCheatCodes = new RollupCheatCodes(
Expand Down
27 changes: 25 additions & 2 deletions yarn-project/end-to-end/src/spartan/smoke.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,23 @@ import { RollupAbi } from '@aztec/l1-artifacts';
import { createPublicClient, getAddress, getContract, http } from 'viem';
import { foundry } from 'viem/chains';

import { getConfig, isK8sConfig, startPortForward } from './utils.js';
import { type AlertConfig } from '../quality_of_service/alert_checker.js';
import { getConfig, isK8sConfig, runAlertCheck, startPortForward } from './utils.js';

const config = getConfig(process.env);

const debugLogger = createDebugLogger('aztec:spartan-test:smoke');
// const userLog = createConsoleLogger();

// QoS alerts for when we are running in k8s
// They are passed to runAlertCheck after the suite has finished. The single rule here flags runs where the
// 2-minute average of aztec_sequencer_time_to_collect_attestations exceeds 2500
// (presumably milliseconds — TODO confirm the unit).
const qosAlerts: AlertConfig[] = [
{
alert: 'SequencerTimeToCollectAttestations',
// Averaged over a 2-minute window so a single slow sample does not trip the alert on its own.
expr: 'avg_over_time(aztec_sequencer_time_to_collect_attestations[2m]) > 2500',
labels: { severity: 'error' },
// NOTE(review): the alert checker only evaluates `expr`; confirm whether `for` is meant to have any effect here.
for: '10m',
annotations: {},
},
];

describe('smoke test', () => {
let pxe: PXE;
Expand All @@ -24,11 +35,23 @@ describe('smoke test', () => {
hostPort: config.HOST_PXE_PORT,
});
PXE_URL = `http://127.0.0.1:${config.HOST_PXE_PORT}`;

await startPortForward({
resource: `svc/metrics-grafana`,
namespace: 'metrics',
containerPort: config.CONTAINER_METRICS_PORT,
hostPort: config.HOST_METRICS_PORT,
});
} else {
PXE_URL = config.PXE_URL;
}
pxe = await createCompatibleClient(PXE_URL, debugLogger);
});

afterAll(async () => {
await runAlertCheck(config, qosAlerts, debugLogger);
});

it('should be able to get node enr', async () => {
const info = await pxe.getNodeInfo();
expect(info).toBeDefined();
Expand Down
Loading