Skip to content

Commit

Permalink
feat: e2e metrics reporting (#9776)
Browse files Browse the repository at this point in the history
  • Loading branch information
Maddiaa0 authored Nov 25, 2024
1 parent 233b387 commit 9cab121
Show file tree
Hide file tree
Showing 16 changed files with 216 additions and 7 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ jobs:
uses: ./.github/ensure-tester-with-images
env:
USERNAME: ${{ needs.configure.outputs.username }}
PULL_REQUEST: ${{ github.event.pull_request.number }}
with:
runner_type: ${{ steps.runner_type.outputs.type }}
builder_type: builder-x86
Expand Down
3 changes: 3 additions & 0 deletions yarn-project/end-to-end/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"formatting": "run -T prettier --check ./src \"!src/web/main.js\" && run -T eslint ./src",
"formatting:fix": "run -T eslint --fix ./src && run -T prettier -w ./src",
"test": "LOG_LEVEL=${LOG_LEVEL:-verbose} DEBUG_COLORS=1 NODE_NO_WARNINGS=1 node --experimental-vm-modules ../node_modules/.bin/jest --testTimeout=300000 --forceExit",
"test:with-alerts": "./scripts/test-with-alerts.sh",
"test:profile": "LOG_LEVEL=${LOG_LEVEL:-verbose} DEBUG_COLORS=1 NODE_NO_WARNINGS=1 0x --output-dir \"flame_graph/{pid}.0x\" -- node --experimental-vm-modules ../node_modules/jest/bin/jest.js --runInBand --testTimeout=300000 --forceExit",
"serve:flames": "python3 -m http.server --directory \"flame_graph\" 8000",
"test:debug": "LOG_LEVEL=${LOG_LEVEL:-verbose} DEBUG_COLORS=1 NODE_NO_WARNINGS=1 node --inspect --experimental-vm-modules ../node_modules/.bin/jest --testTimeout=300000 --forceExit",
Expand Down Expand Up @@ -99,10 +100,12 @@
"0x": "^5.7.0",
"@jest/globals": "^29.5.0",
"@types/jest": "^29.5.0",
"@types/js-yaml": "^4.0.9",
"@types/lodash.chunk": "^4.2.9",
"concurrently": "^7.6.0",
"jest": "^29.5.0",
"jest-extended": "^4.0.2",
"js-yaml": "^4.1.0",
"ts-node": "^10.9.1",
"typescript": "^5.0.4"
},
Expand Down
2 changes: 2 additions & 0 deletions yarn-project/end-to-end/scripts/e2e_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ fi
# Check if the test uses docker compose
if [ "$(echo "$test_config" | yq e '.use_compose // false' -)" = "true" ]; then
$(dirname "$0")/e2e_compose_test.sh "$test_path" "$@" || [ "$ignore_failures" = "true" ]
elif [ "$(echo "$test_config" | yq e '.with_alerts // false' -)" = "true" ]; then
$(dirname "$0")/e2e_test_with_alerts.sh "$test_path" "$@" || [ "$ignore_failures" = "true" ]
else
# Set environment variables
while IFS='=' read -r key value; do
Expand Down
2 changes: 1 addition & 1 deletion yarn-project/end-to-end/scripts/e2e_test_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ tests:
e2e_token_contract: {}
e2e_p2p_gossip:
test_path: 'e2e_p2p/gossip_network.test.ts'
with_alerts: true
e2e_p2p_upgrade_governance_proposer:
test_path: 'e2e_p2p/upgrade_governance_proposer.test.ts'
# https://github.com/AztecProtocol/aztec-packages/issues/9843
e2e_p2p_rediscovery:
test_path: 'e2e_p2p/rediscovery.test.ts'
e2e_p2p_reqresp:
Expand Down
51 changes: 51 additions & 0 deletions yarn-project/end-to-end/scripts/e2e_test_with_alerts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#! /bin/bash
## Run an end to end test with alerts

# Starts the otel-lgtm stack (otel-collector, grafana, prometheus, tempo and loki) in a container,
# runs the given end to end test against it with metrics collection enabled, and then runs the
# alert checker test, which evaluates the alerts defined in the alerts.yaml file.
# Note: these tests must run with METRICS enabled

# Usage: ./e2e_test_with_alerts.sh <test-path> <...extra-args>
# Example: ./e2e_test_with_alerts.sh e2e_p2p/gossip_network.test.ts
#
# Environment read by this script: HARDWARE_CONCURRENCY, FAKE_PROOFS, PULL_REQUEST, AZTEC_DOCKER_TAG,
# env_args (extra `docker run` arguments) and ignore_failures.

set -e

test_path=$1
# Drop the test path from "$@" so that it is not handed to the test container twice below.
if [ $# -gt 0 ]; then
  shift
fi

echo "Running otel stack"
CONTAINER_ID=$(docker run -d -p 3000:3000 -p 4317:4317 -p 4318:4318 --rm grafana/otel-lgtm)

# Stop the otel stack on every exit path. Failures are ignored because the container
# (started with --rm) may already be gone by the time this runs.
cleanup() {
  docker stop "$CONTAINER_ID" > /dev/null 2>&1 || true
}
trap cleanup EXIT
# A trapped signal does not terminate the script on its own, so exit explicitly;
# the EXIT trap above then performs the cleanup.
trap 'exit 130' SIGINT
trap 'exit 143' SIGTERM

echo "Waiting for LGTM stack to be ready..."
timeout=90
while [ "$timeout" -gt 0 ]; do
  if docker logs "$CONTAINER_ID" 2>&1 | grep -q "The OpenTelemetry collector and the Grafana LGTM stack are up and running"; then
    echo "LGTM stack is ready!"
    break
  fi
  sleep 1
  timeout=$((timeout - 1))
done

# The loop only reaches 0 when the ready message was never seen.
if [ "$timeout" -eq 0 ]; then
  echo "Timeout waiting for LGTM stack to be ready"
  exit 1
fi

## Run the existing e2e test, passing the extra arguments through
# $env_args is intentionally unquoted so that it can expand to several docker arguments (or to none).
docker run \
  --network host \
  -e HARDWARE_CONCURRENCY="$HARDWARE_CONCURRENCY" \
  -e FAKE_PROOFS="$FAKE_PROOFS" \
  -e METRICS_PORT="4318" \
  -e COLLECT_METRICS="true" \
  -e PULL_REQUEST="$PULL_REQUEST" \
  $env_args \
  --rm "aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG" \
  "$test_path" "$@" || [ "$ignore_failures" = "true" ]


echo "Running alert checker..."
docker run --network host --rm "aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG" quality_of_service/alert_checker.test.ts
5 changes: 5 additions & 0 deletions yarn-project/end-to-end/src/e2e_p2p/rediscovery.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { sleep } from '@aztec/aztec.js';

import fs from 'fs';

import { shouldCollectMetrics } from '../fixtures/fixtures.js';
import { type NodeContext, createNode, createNodes } from '../fixtures/setup_p2p_test.js';
import { P2PNetworkTest, WAIT_FOR_TX_TIMEOUT } from './p2p_network.js';
import { createPXEServiceAndSubmitTransactions } from './shared.js';
Expand All @@ -23,6 +24,8 @@ describe('e2e_p2p_rediscovery', () => {
testName: 'e2e_p2p_rediscovery',
numberOfNodes: NUM_NODES,
basePort: BOOT_NODE_UDP_PORT,
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
metricsPort: shouldCollectMetrics(),
});
await t.applyBaseSnapshots();
await t.setup();
Expand All @@ -48,6 +51,8 @@ describe('e2e_p2p_rediscovery', () => {
NUM_NODES,
BOOT_NODE_UDP_PORT,
DATA_DIR,
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
shouldCollectMetrics(),
);

// wait a bit for peers to discover each other
Expand Down
6 changes: 6 additions & 0 deletions yarn-project/end-to-end/src/e2e_p2p/reex.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { BlockProposal, SignatureDomainSeperator, getHashedSignaturePayload } fr
import { beforeAll, describe, it, jest } from '@jest/globals';
import fs from 'fs';

import { shouldCollectMetrics } from '../fixtures/fixtures.js';
import { createNodes } from '../fixtures/setup_p2p_test.js';
import { P2PNetworkTest } from './p2p_network.js';
import { submitComplexTxsTo } from './shared.js';
Expand All @@ -28,6 +29,8 @@ describe('e2e_p2p_reex', () => {
testName: 'e2e_p2p_reex',
numberOfNodes: NUM_NODES,
basePort: BOOT_NODE_UDP_PORT,
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
metricsPort: shouldCollectMetrics(),
});

t.logger.verbose('Setup account');
Expand Down Expand Up @@ -66,6 +69,9 @@ describe('e2e_p2p_reex', () => {
t.bootstrapNodeEnr,
NUM_NODES,
BOOT_NODE_UDP_PORT,
DATA_DIR,
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
shouldCollectMetrics(),
);

// Hook into the node and intercept re-execution logic, ensuring that it was in fact called
Expand Down
4 changes: 4 additions & 0 deletions yarn-project/end-to-end/src/e2e_p2p/reqresp.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { jest } from '@jest/globals';
import fs from 'fs';
import { getContract } from 'viem';

import { shouldCollectMetrics } from '../fixtures/fixtures.js';
import { type NodeContext, createNodes } from '../fixtures/setup_p2p_test.js';
import { P2PNetworkTest, WAIT_FOR_TX_TIMEOUT } from './p2p_network.js';
import { createPXEServiceAndSubmitTransactions } from './shared.js';
Expand All @@ -26,6 +27,8 @@ describe('e2e_p2p_reqresp_tx', () => {
testName: 'e2e_p2p_reqresp_tx',
numberOfNodes: NUM_NODES,
basePort: BOOT_NODE_UDP_PORT,
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
metricsPort: shouldCollectMetrics(),
});
await t.applyBaseSnapshots();
await t.setup();
Expand Down Expand Up @@ -67,6 +70,7 @@ describe('e2e_p2p_reqresp_tx', () => {
NUM_NODES,
BOOT_NODE_UDP_PORT,
DATA_DIR,
shouldCollectMetrics(),
);

// wait a bit for peers to discover each other
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import {
import fs from 'fs';
import { getAddress, getContract } from 'viem';

import { shouldCollectMetrics } from '../fixtures/fixtures.js';
import { createNodes } from '../fixtures/setup_p2p_test.js';
import { P2PNetworkTest } from './p2p_network.js';

Expand All @@ -36,6 +37,8 @@ describe('e2e_p2p_governance_proposer', () => {
testName: 'e2e_p2p_gerousia',
numberOfNodes: NUM_NODES,
basePort: BOOT_NODE_UDP_PORT,
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
metricsPort: shouldCollectMetrics(),
});
await t.applyBaseSnapshots();
await t.setup();
Expand Down Expand Up @@ -132,6 +135,7 @@ describe('e2e_p2p_governance_proposer', () => {
NUM_NODES,
BOOT_NODE_UDP_PORT,
DATA_DIR,
shouldCollectMetrics(),
);

await sleep(4000);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import { type DebugLogger, createDebugLogger } from '@aztec/aztec.js';
import { fileURLToPath } from '@aztec/foundation/url';

import * as fs from 'fs';
import * as yaml from 'js-yaml';
import { dirname, join } from 'path';

// Prometheus query API (`/api/v1/query`), reached through Grafana's datasource proxy on localhost:3000
const GRAFANA_ENDPOINT = 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1/query';
/**
 * A single alert rule as read from the alerts YAML file.
 *
 * `for` and `annotations` are optional: the checker in this file never reads them and
 * rules are allowed to leave them out.
 */
interface AlertConfig {
  /** Name of the alert; used in log output. */
  alert: string;
  /** Query expression that is evaluated for this alert. */
  expr: string;
  /** Optional duration string; not read by the checker in this file. */
  for?: string;
  /** Free-form labels attached to the rule (e.g. `severity`). */
  labels: Record<string, string>;
  /** Optional free-form annotations; not read by the checker in this file. */
  annotations?: Record<string, string>;
}
// ES modules do not provide __filename/__dirname, so derive both from this module's URL.
// __dirname is used below to resolve the alerts file relative to this source file.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Reads and parses an alerts YAML file.
 *
 * @param filePath - Path of the YAML file, relative to this module's directory.
 * @returns The entries found under the file's top-level `alerts` key.
 * @throws Error if the parsed file does not contain a top-level `alerts` list.
 */
function loadAlertsConfig(filePath: string): AlertConfig[] {
  const absolutePath = join(__dirname, filePath);
  const parsed: unknown = yaml.load(fs.readFileSync(absolutePath, 'utf8'));

  // Validate the shape instead of trusting a cast: a missing or misspelled `alerts` key would
  // otherwise only surface later as an opaque TypeError when the caller iterates the result.
  const alerts = (parsed as { alerts?: unknown } | null | undefined)?.alerts;
  if (!Array.isArray(alerts)) {
    throw new Error(`Expected a top-level "alerts" list in ${absolutePath}`);
  }
  return alerts as AlertConfig[];
}

/**
 * Evaluates a query expression against Prometheus through the Grafana datasource proxy.
 *
 * @param expr - Query expression to evaluate.
 * @returns The largest sample value among the returned series, or 0 when the query returns no series.
 * @throws Error if the HTTP request fails or the response does not have the expected
 *   `data.result[].value` shape.
 */
async function queryGrafana(expr: string): Promise<number> {
  // Basic auth header value for admin:admin (presumably the defaults of the local Grafana container).
  const credentials = Buffer.from('admin:admin').toString('base64');

  const response = await fetch(`${GRAFANA_ENDPOINT}?query=${encodeURIComponent(expr)}`, {
    headers: {
      Authorization: `Basic ${credentials}`,
    },
  });

  if (!response.ok) {
    throw new Error(`Failed to fetch data from Grafana: ${response.statusText}`);
  }

  const body: unknown = await response.json();
  const series = (body as { data?: { result?: unknown } } | null)?.data?.result;
  if (!Array.isArray(series)) {
    throw new Error(`Unexpected response from Grafana for query "${expr}": ${JSON.stringify(body)}`);
  }
  if (series.length === 0) {
    return 0;
  }

  // Look at every returned series rather than only the first one, so that a breach reported by
  // any series is surfaced to the caller. NaN samples never replace a real number.
  let highest = Number.NaN;
  for (const entry of series) {
    const sample: unknown = entry?.value;
    if (!Array.isArray(sample)) {
      throw new Error(`Unexpected result entry from Grafana for query "${expr}": ${JSON.stringify(entry)}`);
    }
    // Each sample is a [timestamp, value] pair with the value encoded as a string.
    const value = parseFloat(String(sample[1]));
    if (Number.isNaN(highest) || value > highest) {
      highest = value;
    }
  }
  return highest;
}

/**
 * Evaluates every alert expression in order and logs the outcome of each.
 *
 * An alert counts as triggered when its query yields a value greater than 0. All alerts are
 * evaluated before the function fails, so every triggered alert is logged.
 *
 * @param alerts - Alert rules to evaluate.
 * @param logger - Logger receiving progress and failure messages.
 * @throws Error if at least one alert was triggered.
 */
async function checkAlerts(alerts: AlertConfig[], logger: DebugLogger) {
  let triggeredCount = 0;

  for (const rule of alerts) {
    logger.info(`Checking alert: ${JSON.stringify(rule)}`);

    const observed = await queryGrafana(rule.expr);
    logger.info(`Metric value: ${observed}`);

    if (!(observed > 0)) {
      logger.info(`Alert ${rule.alert} passed.`);
      continue;
    }

    logger.error(`Alert ${rule.alert} triggered! Value: ${observed}`);
    triggeredCount += 1;
  }

  // Fail only after all alerts were checked
  if (triggeredCount !== 0) {
    throw new Error('Test failed due to triggered alert');
  }
}

/**
 * Loads the alert rules from `alerts.yaml` and checks all of them.
 *
 * @param logger - Logger receiving progress and failure messages.
 * @throws The error raised while checking the alerts (a triggered alert or a failed query),
 *   after it has been logged.
 */
async function runAlertChecker(logger: DebugLogger) {
  const alerts = loadAlertsConfig('alerts.yaml');
  try {
    await checkAlerts(alerts, logger);
  } catch (error) {
    logger.error(error instanceof Error ? error.message : String(error));
    // Rethrow instead of calling process.exit(1): this runs inside a jest test, where a rejected
    // promise fails the test and lets the runner report it and exit non-zero by itself, whereas
    // exiting the process here would kill the runner mid-test.
    throw error;
  }
  logger.info('All alerts passed.');
}

// Wrapped in a jest suite so that the checker can be launched through the existing end to end test framework
describe('Alert Checker', () => {
  const logger = createDebugLogger('aztec:alert-checker');

  // Returning the promise lets jest wait for the checker to finish
  it('should check alerts', () => runAlertChecker(logger));
});
10 changes: 10 additions & 0 deletions yarn-project/end-to-end/src/quality_of_service/alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
## Quality of service alerts for the sequencer. These are checked in the e2e tests that run with alerts enabled.

## Severity levels in end to end tests:
##   page    - causes the test to fail
##   warning - writes a message to the PR
## NOTE(review): the alert checker added alongside this file fails on any triggered alert and does not read `severity` - confirm

alerts:
- alert: SequencerTimeToCollectAttestations
# Threshold is presumably in milliseconds (the sequencer records this metric from Date.now() differences) - confirm
expr: aztec_sequencer_time_to_collect_attestations > 2500
labels:
severity: page
19 changes: 19 additions & 0 deletions yarn-project/sequencer-client/src/sequencer/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ export class SequencerMetrics {
private currentBlockNumber: Gauge;
private currentBlockSize: Gauge;

private timeToCollectAttestations: Gauge;

constructor(client: TelemetryClient, getState: SequencerStateCallback, name = 'Sequencer') {
const meter = client.getMeter(name);
this.tracer = client.getTracer(name);
Expand Down Expand Up @@ -60,9 +62,26 @@ export class SequencerMetrics {
description: 'Current block number',
});

this.timeToCollectAttestations = meter.createGauge(Metrics.SEQUENCER_TIME_TO_COLLECT_ATTESTATIONS, {
description: 'The time spent collecting attestations from committee members',
});

this.setCurrentBlock(0, 0);
}

/**
 * Starts timing the collection of attestations.
 * @returns A function that, when called, records the number of milliseconds elapsed since this
 *   method was called.
 */
startCollectingAttestationsTimer(): () => void {
  const startedAt = Date.now();
  // Arrow function: `this` is captured lexically, so the returned callback stays tied to this instance
  return () => {
    this.recordTimeToCollectAttestations(Date.now() - startedAt);
  };
}

/**
 * Records a measurement on the time-to-collect-attestations gauge.
 * @param time - Duration to record; the timer returned by startCollectingAttestationsTimer passes milliseconds.
 */
recordTimeToCollectAttestations(time: number) {
this.timeToCollectAttestations.record(time);
}

recordCancelledBlock() {
this.blockCounter.add(1, {
[Attributes.STATUS]: 'cancelled',
Expand Down
8 changes: 5 additions & 3 deletions yarn-project/sequencer-client/src/sequencer/sequencer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -633,11 +633,13 @@ export class Sequencer {
const txHashes = validTxs.map(tx => tx.getTxHash());

this.isFlushing = false;
this.log.info('Collecting attestations');
this.log.verbose('Collecting attestations');
const stopCollectingAttestationsTimer = this.metrics.startCollectingAttestationsTimer();
const attestations = await this.collectAttestations(block, txHashes);
this.log.info('Attestations collected');
this.log.verbose('Attestations collected');
stopCollectingAttestationsTimer();
this.log.verbose('Collecting proof quotes');

this.log.info('Collecting proof quotes');
const proofQuote = await this.createProofClaimForPreviousEpoch(newGlobalVariables.slotNumber.toBigInt());
this.log.info(proofQuote ? `Using proof quote ${inspect(proofQuote.payload)}` : 'No proof quote available');

Expand Down
1 change: 1 addition & 0 deletions yarn-project/telemetry-client/src/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ export const SEQUENCER_BLOCK_COUNT = 'aztec.sequencer.block.count';
export const SEQUENCER_CURRENT_STATE = 'aztec.sequencer.current.state';
export const SEQUENCER_CURRENT_BLOCK_NUMBER = 'aztec.sequencer.current.block_number';
export const SEQUENCER_CURRENT_BLOCK_SIZE = 'aztec.sequencer.current.block_size';
/** Metric name for the time the sequencer spent collecting attestations from committee members. */
export const SEQUENCER_TIME_TO_COLLECT_ATTESTATIONS = 'aztec.sequencer.time_to_collect_attestations';

export const L1_PUBLISHER_GAS_PRICE = 'aztec.l1_publisher.gas_price';
export const L1_PUBLISHER_TX_COUNT = 'aztec.l1_publisher.tx_count';
Expand Down
10 changes: 7 additions & 3 deletions yarn-project/telemetry-client/src/prom_otel_adapter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ interface IGauge<Labels extends LabelsGeneric = NoLabels> {
set: NoLabels extends Labels ? (value: number) => void : (labels: Labels, value: number) => void;

collect?(): void;
addCollect(fn: CollectFn<Labels>): void;
addCollect(collectFn: CollectFn<Labels>): void;
}

interface IHistogram<Labels extends LabelsGeneric = NoLabels> {
Expand Down Expand Up @@ -101,8 +101,12 @@ export class OtelGauge<Labels extends LabelsGeneric = NoLabels> implements IGaug
this.gauge.addCallback(this.handleObservation.bind(this));
}

addCollect(fn: CollectFn<Labels>): void {
this.collectFns.push(fn);
/**
* Add a collect callback
* @param collectFn - Callback function
*/
addCollect(collectFn: CollectFn<Labels>): void {
this.collectFns.push(collectFn);
}

handleObservation(result: any): void {
Expand Down
Loading

0 comments on commit 9cab121

Please sign in to comment.