Skip to content

Commit

Permalink
break out alert checker.
Browse files Browse the repository at this point in the history
call it from gating passive test.
  • Loading branch information
just-mitch committed Nov 26, 2024
1 parent b51fc43 commit dcd6af0
Show file tree
Hide file tree
Showing 7 changed files with 178 additions and 83 deletions.
2 changes: 1 addition & 1 deletion yarn-project/end-to-end/scripts/e2e_test_with_alerts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ docker run \


echo "Running alert checker..."
docker run --network host --rm aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG quality_of_service/alert_checker.test.ts
docker run -e ALERTS_FILE="alerts.yaml" --network host --rm aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG quality_of_service/alert_checker.test.ts
10 changes: 8 additions & 2 deletions yarn-project/end-to-end/scripts/network_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,15 @@ helm upgrade --install spartan "$REPO/spartan/aztec-network/" \

kubectl wait pod -l app==pxe --for=condition=Ready -n "$NAMESPACE" --timeout=10m

# Find two free ports between 9000 and 10000
FREE_PORTS=$(comm -23 <(seq 9000 10000 | sort) <(ss -Htan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n 2)
# Find 3 free ports between 9000 and 10000
FREE_PORTS=$(comm -23 <(seq 9000 10000 | sort) <(ss -Htan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n 3)

# Extract the three free ports from the list
PXE_PORT=$(echo $FREE_PORTS | awk '{print $1}')
ANVIL_PORT=$(echo $FREE_PORTS | awk '{print $2}')
METRICS_PORT=$(echo $FREE_PORTS | awk '{print $3}')

GRAFANA_PASSWORD=$(kubectl get secrets -n metrics metrics-grafana -o jsonpath='{.data.admin-password}' | base64 --decode)

# Namespace variable (assuming it's set)
NAMESPACE=${NAMESPACE:-default}
Expand All @@ -170,6 +173,9 @@ docker run --rm --network=host \
-e CONTAINER_PXE_PORT=8081 \
-e HOST_ETHEREUM_PORT=$ANVIL_PORT \
-e CONTAINER_ETHEREUM_PORT=8545 \
-e HOST_METRICS_PORT=$METRICS_PORT \
-e CONTAINER_METRICS_PORT=80 \
-e GRAFANA_PASSWORD=$GRAFANA_PASSWORD \
-e DEBUG="aztec:*" \
-e LOG_JSON=1 \
-e LOG_LEVEL=debug \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,88 +1,18 @@
import { type DebugLogger, createDebugLogger } from '@aztec/aztec.js';
import { fileURLToPath } from '@aztec/foundation/url';
import { createDebugLogger } from '@aztec/aztec.js';

import * as fs from 'fs';
import * as yaml from 'js-yaml';
import { dirname, join } from 'path';
import { AlertChecker } from './alert_checker.js';

const GRAFANA_ENDPOINT = 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1/query';
interface AlertConfig {
alert: string;
expr: string;
for: string;
labels: Record<string, string>;
annotations: Record<string, string>;
}
// Define __dirname for ES modules
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Load YAML configuration
function loadAlertsConfig(filePath: string): AlertConfig[] {
const fileContents = fs.readFileSync(join(__dirname, filePath), 'utf8');
const data = yaml.load(fileContents) as { alerts: AlertConfig[] };
return data.alerts;
}

// Function to query Grafana based on an expression
async function queryGrafana(expr: string): Promise<number> {
// Create base64 encoded credentials for basic auth
const credentials = Buffer.from('admin:admin').toString('base64');
const ALERTS_FILE = process.env.ALERTS_FILE;

const response = await fetch(`${GRAFANA_ENDPOINT}?query=${encodeURIComponent(expr)}`, {
headers: {
Authorization: `Basic ${credentials}`,
},
});

if (!response.ok) {
throw new Error(`Failed to fetch data from Grafana: ${response.statusText}`);
}

const data = await response.json();
const result = data.data.result;
return result.length > 0 ? parseFloat(result[0].value[1]) : 0;
if (!ALERTS_FILE) {
throw new Error('ALERTS_FILE is not set');
}

// Function to check alerts based on expressions
async function checkAlerts(alerts: AlertConfig[], logger: DebugLogger) {
let alertTriggered = false;

for (const alert of alerts) {
logger.info(`Checking alert: ${JSON.stringify(alert)}`);

const metricValue = await queryGrafana(alert.expr);
logger.info(`Metric value: ${metricValue}`);
if (metricValue > 0) {
logger.error(`Alert ${alert.alert} triggered! Value: ${metricValue}`);
alertTriggered = true;
} else {
logger.info(`Alert ${alert.alert} passed.`);
}
}

// If any alerts have been triggered we fail the test
if (alertTriggered) {
throw new Error('Test failed due to triggered alert');
}
}

// Main function to run tests
async function runAlertChecker(logger: DebugLogger) {
const alerts = loadAlertsConfig('alerts.yaml');
try {
await checkAlerts(alerts, logger);
logger.info('All alerts passed.');
} catch (error) {
logger.error(error instanceof Error ? error.message : String(error));
process.exit(1); // Exit with error code
}
}

// Running as a jest test to use existing end to end test framework
describe('Alert Checker', () => {
const logger = createDebugLogger('aztec:alert-checker');
const alertChecker = new AlertChecker(logger);

it('should check alerts', async () => {
await runAlertChecker(logger);
await alertChecker.runAlertCheckFromFilePath(ALERTS_FILE);
});
});
96 changes: 96 additions & 0 deletions yarn-project/end-to-end/src/quality_of_service/alert_checker.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import { type DebugLogger } from '@aztec/aztec.js';
import { fileURLToPath } from '@aztec/foundation/url';

import * as fs from 'fs';
import * as yaml from 'js-yaml';
import { dirname, join } from 'path';

/**
 * A single alert rule handed to the AlertChecker.
 *
 * Only `alert` and `expr` are read by the checker in this file; the remaining
 * fields are carried along and show up in the JSON dump logged per alert.
 * (Presumably the shape mirrors a Prometheus alerting rule — confirm against
 * the alerts YAML file.)
 */
export interface AlertConfig {
  /** Query expression sent to Grafana; a result value above zero counts as triggered. */
  expr: string;
  /** Name of the alert, used in log messages. */
  alert: string;
  /** Free-form labels attached to the rule; not interpreted by the checker. */
  labels: Record<string, string>;
  /** Free-form annotations attached to the rule; not interpreted by the checker. */
  annotations: Record<string, string>;
  /** Duration string attached to the rule; not interpreted by the checker. */
  for: string;
}

/** Connection settings used by the AlertChecker when querying Grafana. */
export interface AlertCheckerConfig {
  /** `user:password` pair; it is base64-encoded into an HTTP Basic Authorization header. */
  grafanaCredentials: string;
  /** Full URL of the query endpoint; `?query=<expr>` is appended to it for every alert. */
  grafanaEndpoint: string;
}

// Fallback settings; the AlertChecker constructor spreads caller overrides on top of these.
const DEFAULT_CONFIG: AlertCheckerConfig = {
// Query endpoint reached through Grafana's datasource proxy (datasource uid `prometheus`) on localhost:3000.
grafanaEndpoint: 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1/query',
// `user:password` pair that queryGrafana base64-encodes into a Basic auth header.
grafanaCredentials: 'admin:admin',
};

/**
 * Evaluates alert expressions by querying Grafana and fails (throws) when any
 * expression yields a value greater than zero.
 */
export class AlertChecker {
  private config: AlertCheckerConfig;
  private logger: DebugLogger;

  /**
   * @param logger - Receives progress, success and failure messages.
   * @param config - Optional overrides; any field left out falls back to DEFAULT_CONFIG.
   */
  constructor(logger: DebugLogger, config: Partial<AlertCheckerConfig> = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
    this.logger = logger;
  }

  /**
   * Reads a YAML file and returns its top-level `alerts` list.
   *
   * @param filePath - Path resolved relative to the directory of this module.
   * @throws Error if the file does not contain a top-level `alerts` array.
   */
  private loadAlertsConfig(filePath: string): AlertConfig[] {
    const __filename = fileURLToPath(import.meta.url);
    const __dirname = dirname(__filename);
    const fileContents = fs.readFileSync(join(__dirname, filePath), 'utf8');
    const parsed: unknown = yaml.load(fileContents);
    const alerts =
      typeof parsed === 'object' && parsed !== null ? (parsed as { alerts?: unknown }).alerts : undefined;
    // Fail with a clear message instead of a TypeError later when iterating `undefined`.
    if (!Array.isArray(alerts)) {
      throw new Error(`Alerts file ${filePath} does not contain a top-level "alerts" list`);
    }
    return alerts as AlertConfig[];
  }

  /**
   * Runs one query against the configured Grafana endpoint.
   *
   * @param expr - Query expression; it is URL-encoded into the `query` parameter.
   * @returns The largest sample value across all returned series, or 0 when the
   *   result is empty (or contains no parseable value).
   * @throws Error on a non-OK HTTP status or a response without a `data.result` array.
   */
  private async queryGrafana(expr: string): Promise<number> {
    const credentials = Buffer.from(this.config.grafanaCredentials).toString('base64');

    const response = await fetch(`${this.config.grafanaEndpoint}?query=${encodeURIComponent(expr)}`, {
      headers: {
        Authorization: `Basic ${credentials}`,
      },
    });

    if (!response.ok) {
      throw new Error(`Failed to fetch data from Grafana: ${response.statusText}`);
    }

    const body = (await response.json()) as { data?: { result?: unknown } } | null;
    const result = body?.data?.result;
    if (!Array.isArray(result)) {
      throw new Error(`Unexpected response from Grafana for query "${expr}": ${JSON.stringify(body)}`);
    }

    // Look at every series, not just the first one: a later series may be the one that is firing.
    let highest: number | undefined;
    for (const sample of result as Array<{ value?: [number, string] } | null>) {
      const value = parseFloat(sample?.value?.[1] ?? '');
      if (Number.isNaN(value)) {
        continue;
      }
      if (highest === undefined || value > highest) {
        highest = value;
      }
    }
    return highest ?? 0;
  }

  /**
   * Queries every alert in order and logs the outcome of each one.
   *
   * @throws Error after all alerts were evaluated if at least one returned a value above zero.
   */
  private async checkAlerts(alerts: AlertConfig[]): Promise<void> {
    let alertTriggered = false;

    for (const alert of alerts) {
      this.logger.info(`Checking alert: ${JSON.stringify(alert)}`);

      const metricValue = await this.queryGrafana(alert.expr);
      this.logger.info(`Metric value: ${metricValue}`);
      if (metricValue > 0) {
        this.logger.error(`Alert ${alert.alert} triggered! Value: ${metricValue}`);
        alertTriggered = true;
      } else {
        this.logger.info(`Alert ${alert.alert} passed.`);
      }
    }

    if (alertTriggered) {
      throw new Error('Test failed due to triggered alert');
    }
  }

  /**
   * Checks the given alerts, logging a summary on success and the error message on failure.
   *
   * @throws Rethrows whatever the check raised (triggered alert or query failure).
   */
  public async runAlertCheck(alerts: AlertConfig[]): Promise<void> {
    try {
      await this.checkAlerts(alerts);
      this.logger.info('All alerts passed.');
    } catch (error) {
      this.logger.error(error instanceof Error ? error.message : String(error));
      throw error;
    }
  }

  /**
   * Loads alerts from a YAML file (path relative to this module) and checks them.
   *
   * @throws Error if the file is malformed, a query fails, or any alert is triggered.
   */
  public async runAlertCheckFromFilePath(filePath: string): Promise<void> {
    const alerts = this.loadAlertsConfig(filePath);
    // Go through runAlertCheck so both entry points produce the same summary/error logging.
    await this.runAlertCheck(alerts);
  }
}
23 changes: 23 additions & 0 deletions yarn-project/end-to-end/src/spartan/gating-passive.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { createDebugLogger } from '@aztec/foundation/log';
import { expect, jest } from '@jest/globals';

import { RollupCheatCodes } from '../../../aztec.js/src/utils/cheat_codes.js';
import { type AlertConfig } from '../quality_of_service/alert_checker.js';
import {
applyBootNodeFailure,
applyNetworkShaping,
Expand All @@ -13,9 +14,20 @@ import {
getConfig,
isK8sConfig,
restartBot,
runAlertCheck,
startPortForward,
} from './utils.js';

// Alerts evaluated once after the suite via runAlertCheck (see the afterAll hook below).
// NOTE(review): AlertChecker only reads `alert` and `expr`; `for`, `labels` and
// `annotations` are not applied when the expression is evaluated.
const qosAlerts: AlertConfig[] = [
{
alert: 'SequencerTimeToCollectAttestations',
// Yields a sample only while the 2-minute average exceeds 2500 (unit presumably ms — confirm).
expr: 'avg_over_time(aztec_sequencer_time_to_collect_attestations[2m]) > 2500',
labels: { severity: 'error' },
for: '10m',
annotations: {},
},
];

const config = getConfig(process.env);
if (!isK8sConfig(config)) {
throw new Error('This test must be run in a k8s environment');
Expand All @@ -39,6 +51,10 @@ describe('a test that passively observes the network in the presence of network
// 50% is the max that we expect to miss
const MAX_MISSED_SLOT_PERCENT = 0.5;

afterAll(async () => {
await runAlertCheck(config, qosAlerts, debugLogger);
});

it('survives network chaos', async () => {
await startPortForward({
resource: `svc/${config.INSTANCE_NAME}-aztec-network-pxe`,
Expand All @@ -52,6 +68,13 @@ describe('a test that passively observes the network in the presence of network
containerPort: CONTAINER_ETHEREUM_PORT,
hostPort: HOST_ETHEREUM_PORT,
});

await startPortForward({
resource: `svc/metrics-grafana`,
namespace: 'metrics',
containerPort: config.CONTAINER_METRICS_PORT,
hostPort: config.HOST_METRICS_PORT,
});
const client = await createCompatibleClient(PXE_URL, debugLogger);
const ethCheatCodes = new EthCheatCodes(ETHEREUM_HOST);
const rollupCheatCodes = new RollupCheatCodes(
Expand Down
27 changes: 25 additions & 2 deletions yarn-project/end-to-end/src/spartan/smoke.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,23 @@ import { RollupAbi } from '@aztec/l1-artifacts';
import { createPublicClient, getAddress, getContract, http } from 'viem';
import { foundry } from 'viem/chains';

import { getConfig, isK8sConfig, startPortForward } from './utils.js';
import { type AlertConfig } from '../quality_of_service/alert_checker.js';
import { getConfig, isK8sConfig, runAlertCheck, startPortForward } from './utils.js';

const config = getConfig(process.env);

const debugLogger = createDebugLogger('aztec:spartan-test:smoke');
// const userLog = createConsoleLogger();

// QoS alerts checked after the suite when running in k8s (runAlertCheck skips them otherwise).
// NOTE(review): AlertChecker only reads `alert` and `expr`; `for`, `labels` and
// `annotations` are not applied when the expression is evaluated.
const qosAlerts: AlertConfig[] = [
{
alert: 'SequencerTimeToCollectAttestations',
// Yields a sample only while the 2-minute average exceeds 2500 (unit presumably ms — confirm).
expr: 'avg_over_time(aztec_sequencer_time_to_collect_attestations[2m]) > 2500',
labels: { severity: 'error' },
for: '10m',
annotations: {},
},
];

describe('smoke test', () => {
let pxe: PXE;
Expand All @@ -24,11 +35,23 @@ describe('smoke test', () => {
hostPort: config.HOST_PXE_PORT,
});
PXE_URL = `http://127.0.0.1:${config.HOST_PXE_PORT}`;

await startPortForward({
resource: `svc/metrics-grafana`,
namespace: 'metrics',
containerPort: config.CONTAINER_METRICS_PORT,
hostPort: config.HOST_METRICS_PORT,
});
} else {
PXE_URL = config.PXE_URL;
}
pxe = await createCompatibleClient(PXE_URL, debugLogger);
});

afterAll(async () => {
await runAlertCheck(config, qosAlerts, debugLogger);
});

it('should be able to get node enr', async () => {
const info = await pxe.getNodeInfo();
expect(info).toBeDefined();
Expand Down
17 changes: 17 additions & 0 deletions yarn-project/end-to-end/src/spartan/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { promisify } from 'util';
import { z } from 'zod';

import type { RollupCheatCodes } from '../../../aztec.js/src/utils/cheat_codes.js';
import { AlertChecker, type AlertConfig } from '../quality_of_service/alert_checker.js';

const execAsync = promisify(exec);

Expand All @@ -19,6 +20,10 @@ const k8sConfigSchema = z.object({
CONTAINER_PXE_PORT: z.coerce.number().default(8080),
HOST_ETHEREUM_PORT: z.coerce.number().min(1, 'HOST_ETHEREUM_PORT env variable must be set'),
CONTAINER_ETHEREUM_PORT: z.coerce.number().default(8545),
HOST_METRICS_PORT: z.coerce.number().min(1, 'HOST_METRICS_PORT env variable must be set'),
CONTAINER_METRICS_PORT: z.coerce.number().default(80),
GRAFANA_PASSWORD: z.string().min(1, 'GRAFANA_PASSWORD env variable must be set'),
METRICS_API_PATH: z.string().default('/api/datasources/proxy/uid/spartan-metrics-prometheus/api/v1/query'),
SPARTAN_DIR: z.string().min(1, 'SPARTAN_DIR env variable must be set'),
K8S: z.literal('true'),
});
Expand Down Expand Up @@ -382,3 +387,15 @@ export async function enableValidatorDynamicBootNode(

logger.info(`Validator dynamic boot node enabled`);
}

/**
 * Runs the given alerts against Grafana using the metrics settings from a k8s config.
 *
 * For any non-k8s config the check is skipped and only an info message is logged.
 *
 * @param config - Environment config; the metrics port, API path and Grafana password are read from it.
 * @param alerts - Alerts to evaluate.
 * @param logger - Logger handed to the AlertChecker and used for the skip message.
 */
export async function runAlertCheck(config: EnvConfig, alerts: AlertConfig[], logger: Logger) {
  if (!isK8sConfig(config)) {
    logger.info('Not running alert check in non-k8s environment');
    return;
  }

  const { HOST_METRICS_PORT, METRICS_API_PATH, GRAFANA_PASSWORD } = config;
  const checker = new AlertChecker(logger, {
    grafanaEndpoint: `http://localhost:${HOST_METRICS_PORT}${METRICS_API_PATH}`,
    grafanaCredentials: `admin:${GRAFANA_PASSWORD}`,
  });
  await checker.runAlertCheck(alerts);
}

0 comments on commit dcd6af0

Please sign in to comment.