From 9cab1212e7040fb4c31db9bbb24f7f43413e8ed1 Mon Sep 17 00:00:00 2001 From: Maddiaa <47148561+Maddiaa0@users.noreply.github.com> Date: Mon, 25 Nov 2024 22:32:03 +0800 Subject: [PATCH] feat: e2e metrics reporting (#9776) --- .github/workflows/ci.yml | 1 + yarn-project/end-to-end/package.json | 3 + yarn-project/end-to-end/scripts/e2e_test.sh | 2 + .../end-to-end/scripts/e2e_test_config.yml | 2 +- .../scripts/e2e_test_with_alerts.sh | 51 +++++++++++ .../src/e2e_p2p/rediscovery.test.ts | 5 ++ .../end-to-end/src/e2e_p2p/reex.test.ts | 6 ++ .../end-to-end/src/e2e_p2p/reqresp.test.ts | 4 + .../upgrade_governance_proposer.test.ts | 4 + .../quality_of_service/alert_checker.test.ts | 88 +++++++++++++++++++ .../src/quality_of_service/alerts.yaml | 10 +++ .../sequencer-client/src/sequencer/metrics.ts | 19 ++++ .../src/sequencer/sequencer.ts | 8 +- yarn-project/telemetry-client/src/metrics.ts | 1 + .../telemetry-client/src/prom_otel_adapter.ts | 10 ++- yarn-project/yarn.lock | 9 ++ 16 files changed, 216 insertions(+), 7 deletions(-) create mode 100755 yarn-project/end-to-end/scripts/e2e_test_with_alerts.sh create mode 100644 yarn-project/end-to-end/src/quality_of_service/alert_checker.test.ts create mode 100644 yarn-project/end-to-end/src/quality_of_service/alerts.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 65944a01641..45837c1954d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -208,6 +208,7 @@ jobs: uses: ./.github/ensure-tester-with-images env: USERNAME: ${{ needs.configure.outputs.username }} + PULL_REQUEST: ${{ github.event.pull_request.number }} with: runner_type: ${{ steps.runner_type.outputs.type }} builder_type: builder-x86 diff --git a/yarn-project/end-to-end/package.json b/yarn-project/end-to-end/package.json index 76c4307dbf6..d9514c42dfb 100644 --- a/yarn-project/end-to-end/package.json +++ b/yarn-project/end-to-end/package.json @@ -16,6 +16,7 @@ "formatting": "run -T prettier --check ./src \"!src/web/main.js\" && run -T eslint ./src", "formatting:fix": "run -T eslint --fix ./src && run -T prettier -w ./src", "test": "LOG_LEVEL=${LOG_LEVEL:-verbose} DEBUG_COLORS=1 NODE_NO_WARNINGS=1 node --experimental-vm-modules ../node_modules/.bin/jest --testTimeout=300000 --forceExit", + "test:with-alerts": "./scripts/test-with-alerts.sh", "test:profile": "LOG_LEVEL=${LOG_LEVEL:-verbose} DEBUG_COLORS=1 NODE_NO_WARNINGS=1 0x --output-dir \"flame_graph/{pid}.0x\" -- node --experimental-vm-modules ../node_modules/jest/bin/jest.js --runInBand --testTimeout=300000 --forceExit", "serve:flames": "python3 -m http.server --directory \"flame_graph\" 8000", "test:debug": "LOG_LEVEL=${LOG_LEVEL:-verbose} DEBUG_COLORS=1 NODE_NO_WARNINGS=1 node --inspect --experimental-vm-modules ../node_modules/.bin/jest --testTimeout=300000 --forceExit", @@ -99,10 +100,12 @@ "0x": "^5.7.0", "@jest/globals": "^29.5.0", "@types/jest": "^29.5.0", + "@types/js-yaml": "^4.0.9", "@types/lodash.chunk": "^4.2.9", "concurrently": "^7.6.0", "jest": "^29.5.0", "jest-extended": "^4.0.2", + "js-yaml": "^4.1.0", "ts-node": "^10.9.1", "typescript": "^5.0.4" }, diff --git a/yarn-project/end-to-end/scripts/e2e_test.sh b/yarn-project/end-to-end/scripts/e2e_test.sh index 8422d7d82c9..9670a51f5fe 100755 --- a/yarn-project/end-to-end/scripts/e2e_test.sh +++ b/yarn-project/end-to-end/scripts/e2e_test.sh @@ -50,6 +50,8 @@ fi # Check if the test uses docker compose if [ "$(echo "$test_config" | yq e '.use_compose // false' -)" = "true" ]; then $(dirname "$0")/e2e_compose_test.sh "$test_path" "$@" || [ "$ignore_failures" = "true" ] +elif [ "$(echo "$test_config" | yq e '.with_alerts // false' -)" = "true" ]; then + $(dirname "$0")/e2e_test_with_alerts.sh "$test_path" "$@" || [ "$ignore_failures" = "true" ] else # Set environment variables while IFS='=' read -r key value; do diff --git a/yarn-project/end-to-end/scripts/e2e_test_config.yml b/yarn-project/end-to-end/scripts/e2e_test_config.yml index b70bd9024d2..04f2212d5a1 100644 --- a/yarn-project/end-to-end/scripts/e2e_test_config.yml +++ b/yarn-project/end-to-end/scripts/e2e_test_config.yml @@ -85,9 +85,9 @@ tests: e2e_token_contract: {} e2e_p2p_gossip: test_path: 'e2e_p2p/gossip_network.test.ts' + with_alerts: true e2e_p2p_upgrade_governance_proposer: test_path: 'e2e_p2p/upgrade_governance_proposer.test.ts' - # https://github.com/AztecProtocol/aztec-packages/issues/9843 e2e_p2p_rediscovery: test_path: 'e2e_p2p/rediscovery.test.ts' e2e_p2p_reqresp: diff --git a/yarn-project/end-to-end/scripts/e2e_test_with_alerts.sh b/yarn-project/end-to-end/scripts/e2e_test_with_alerts.sh new file mode 100755 index 00000000000..a4f5d9cfbc3 --- /dev/null +++ b/yarn-project/end-to-end/scripts/e2e_test_with_alerts.sh @@ -0,0 +1,51 @@ +#! /bin/bash +## Run an end to end test with alerts + +# This will run an end to end test running the otel-lgtm stack (otel-collector, grafana, prometheus, tempo and loki) +# Then check the test against a set of alerts defined in the alerts.yaml file +# Note: these tests must run with METRICS enabled + +# Usage: ./e2e_test_with_alerts.sh <...extra-args> +# Example: ./e2e_test_with_alerts.sh gossip_network + +set -e + +test_path=$1 + +echo "Running otel stack" +CONTAINER_ID=$(docker run -d -p 3000:3000 -p 4317:4317 -p 4318:4318 --rm grafana/otel-lgtm) + +trap "docker stop $CONTAINER_ID" EXIT SIGINT SIGTERM + +echo "Waiting for LGTM stack to be ready..." +timeout=90 +while [ $timeout -gt 0 ]; do + if docker logs $CONTAINER_ID 2>&1 | grep -q "The OpenTelemetry collector and the Grafana LGTM stack are up and running"; then + echo "LGTM stack is ready!" + break + fi + sleep 1 + ((timeout--)) +done + +if [ $timeout -eq 0 ]; then + echo "Timeout waiting for LGTM stack to be ready" + docker stop $CONTAINER_ID + exit 1 +fi + +## Pass through run the existing e2e test +docker run \ + --network host \ + -e HARDWARE_CONCURRENCY="$HARDWARE_CONCURRENCY" \ + -e FAKE_PROOFS="$FAKE_PROOFS" \ + -e METRICS_PORT="4318" \ + -e COLLECT_METRICS="true" \ + -e PULL_REQUEST="$PULL_REQUEST" \ + $env_args \ + --rm aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG \ + "$test_path" "$@" || [ "$ignore_failures" = "true" ] + + +echo "Running alert checker..." +docker run --network host --rm aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG quality_of_service/alert_checker.test.ts diff --git a/yarn-project/end-to-end/src/e2e_p2p/rediscovery.test.ts b/yarn-project/end-to-end/src/e2e_p2p/rediscovery.test.ts index bf3879d248c..9d3b5b4c3a2 100644 --- a/yarn-project/end-to-end/src/e2e_p2p/rediscovery.test.ts +++ b/yarn-project/end-to-end/src/e2e_p2p/rediscovery.test.ts @@ -3,6 +3,7 @@ import { sleep } from '@aztec/aztec.js'; import fs from 'fs'; +import { shouldCollectMetrics } from '../fixtures/fixtures.js'; import { type NodeContext, createNode, createNodes } from '../fixtures/setup_p2p_test.js'; import { P2PNetworkTest, WAIT_FOR_TX_TIMEOUT } from './p2p_network.js'; import { createPXEServiceAndSubmitTransactions } from './shared.js'; @@ -23,6 +24,8 @@ describe('e2e_p2p_rediscovery', () => { testName: 'e2e_p2p_rediscovery', numberOfNodes: NUM_NODES, basePort: BOOT_NODE_UDP_PORT, + // To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true + metricsPort: shouldCollectMetrics(), }); await t.applyBaseSnapshots(); await t.setup(); @@ -48,6 +51,8 @@ describe('e2e_p2p_rediscovery', () => { NUM_NODES, BOOT_NODE_UDP_PORT, DATA_DIR, + // To collect metrics - run in aztec-packages `docker compose --profile metrics up` + shouldCollectMetrics(), ); // wait a bit for peers to discover each other diff --git a/yarn-project/end-to-end/src/e2e_p2p/reex.test.ts b/yarn-project/end-to-end/src/e2e_p2p/reex.test.ts index 631f2910596..fcb1700fc83 100644 --- a/yarn-project/end-to-end/src/e2e_p2p/reex.test.ts +++ b/yarn-project/end-to-end/src/e2e_p2p/reex.test.ts @@ -7,6 +7,7 @@ import { BlockProposal, SignatureDomainSeperator, getHashedSignaturePayload } fr import { beforeAll, describe, it, jest } from '@jest/globals'; import fs from 'fs'; +import { shouldCollectMetrics } from '../fixtures/fixtures.js'; import { createNodes } from '../fixtures/setup_p2p_test.js'; import { P2PNetworkTest } from './p2p_network.js'; import { submitComplexTxsTo } from './shared.js'; @@ -28,6 +29,8 @@ describe('e2e_p2p_reex', () => { testName: 'e2e_p2p_reex', numberOfNodes: NUM_NODES, basePort: BOOT_NODE_UDP_PORT, + // To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true + metricsPort: shouldCollectMetrics(), }); t.logger.verbose('Setup account'); @@ -66,6 +69,9 @@ describe('e2e_p2p_reex', () => { t.bootstrapNodeEnr, NUM_NODES, BOOT_NODE_UDP_PORT, + DATA_DIR, + // To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true + shouldCollectMetrics(), ); // Hook into the node and intercept re-execution logic, ensuring that it was infact called diff --git a/yarn-project/end-to-end/src/e2e_p2p/reqresp.test.ts b/yarn-project/end-to-end/src/e2e_p2p/reqresp.test.ts index 1f0d20c04d3..cba52f3d477 100644 --- a/yarn-project/end-to-end/src/e2e_p2p/reqresp.test.ts +++ b/yarn-project/end-to-end/src/e2e_p2p/reqresp.test.ts @@ -6,6 +6,7 @@ import { jest } from '@jest/globals'; import fs from 'fs'; import { getContract } from 'viem'; +import { shouldCollectMetrics } from '../fixtures/fixtures.js'; import { type NodeContext, createNodes } from '../fixtures/setup_p2p_test.js'; import { P2PNetworkTest, WAIT_FOR_TX_TIMEOUT } from './p2p_network.js'; import { createPXEServiceAndSubmitTransactions } from './shared.js'; @@ -26,6 +27,8 @@ describe('e2e_p2p_reqresp_tx', () => { testName: 'e2e_p2p_reqresp_tx', numberOfNodes: NUM_NODES, basePort: BOOT_NODE_UDP_PORT, + // To collect metrics - run in aztec-packages `docker compose --profile metrics up` + metricsPort: shouldCollectMetrics(), }); await t.applyBaseSnapshots(); await t.setup(); @@ -67,6 +70,7 @@ describe('e2e_p2p_reqresp_tx', () => { NUM_NODES, BOOT_NODE_UDP_PORT, DATA_DIR, + shouldCollectMetrics(), ); // wait a bit for peers to discover each other diff --git a/yarn-project/end-to-end/src/e2e_p2p/upgrade_governance_proposer.test.ts b/yarn-project/end-to-end/src/e2e_p2p/upgrade_governance_proposer.test.ts index 20ebfba62fc..0645911102e 100644 --- a/yarn-project/end-to-end/src/e2e_p2p/upgrade_governance_proposer.test.ts +++ b/yarn-project/end-to-end/src/e2e_p2p/upgrade_governance_proposer.test.ts @@ -12,6 +12,7 @@ import { import fs from 'fs'; import { getAddress, getContract } from 'viem'; +import { shouldCollectMetrics } from '../fixtures/fixtures.js'; import { createNodes } from '../fixtures/setup_p2p_test.js'; import { P2PNetworkTest } from './p2p_network.js'; @@ -36,6 +37,8 @@ describe('e2e_p2p_governance_proposer', () => { testName: 'e2e_p2p_gerousia', numberOfNodes: NUM_NODES, basePort: BOOT_NODE_UDP_PORT, + // To collect metrics - run in aztec-packages `docker compose --profile metrics up` + metricsPort: shouldCollectMetrics(), }); await t.applyBaseSnapshots(); await t.setup(); @@ -132,6 +135,7 @@ describe('e2e_p2p_governance_proposer', () => { NUM_NODES, BOOT_NODE_UDP_PORT, DATA_DIR, + shouldCollectMetrics(), ); await sleep(4000); diff --git a/yarn-project/end-to-end/src/quality_of_service/alert_checker.test.ts b/yarn-project/end-to-end/src/quality_of_service/alert_checker.test.ts new file mode 100644 index 00000000000..108401c03ec --- /dev/null +++ b/yarn-project/end-to-end/src/quality_of_service/alert_checker.test.ts @@ -0,0 +1,88 @@ +import { type DebugLogger, createDebugLogger } from '@aztec/aztec.js'; +import { fileURLToPath } from '@aztec/foundation/url'; + +import * as fs from 'fs'; +import * as yaml from 'js-yaml'; +import { dirname, join } from 'path'; + +const GRAFANA_ENDPOINT = 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1/query'; +interface AlertConfig { + alert: string; + expr: string; + for: string; + labels: Record; + annotations: Record; +} +// Define __dirname for ES modules +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +// Load YAML configuration +function loadAlertsConfig(filePath: string): AlertConfig[] { + const fileContents = fs.readFileSync(join(__dirname, filePath), 'utf8'); + const data = yaml.load(fileContents) as { alerts: AlertConfig[] }; + return data.alerts; +} + +// Function to query Grafana based on an expression +async function queryGrafana(expr: string): Promise { + // Create base64 encoded credentials for basic auth + const credentials = Buffer.from('admin:admin').toString('base64'); + + const response = await fetch(`${GRAFANA_ENDPOINT}?query=${encodeURIComponent(expr)}`, { + headers: { + Authorization: `Basic ${credentials}`, + }, + }); + + if (!response.ok) { + throw new Error(`Failed to fetch data from Grafana: ${response.statusText}`); + } + + const data = await response.json(); + const result = data.data.result; + return result.length > 0 ? parseFloat(result[0].value[1]) : 0; +} + +// Function to check alerts based on expressions +async function checkAlerts(alerts: AlertConfig[], logger: DebugLogger) { + let alertTriggered = false; + + for (const alert of alerts) { + logger.info(`Checking alert: ${JSON.stringify(alert)}`); + + const metricValue = await queryGrafana(alert.expr); + logger.info(`Metric value: ${metricValue}`); + if (metricValue > 0) { + logger.error(`Alert ${alert.alert} triggered! Value: ${metricValue}`); + alertTriggered = true; + } else { + logger.info(`Alert ${alert.alert} passed.`); + } + } + + // If any alerts have been triggered we fail the test + if (alertTriggered) { + throw new Error('Test failed due to triggered alert'); + } +} + +// Main function to run tests +async function runAlertChecker(logger: DebugLogger) { + const alerts = loadAlertsConfig('alerts.yaml'); + try { + await checkAlerts(alerts, logger); + logger.info('All alerts passed.'); + } catch (error) { + logger.error(error instanceof Error ? error.message : String(error)); + process.exit(1); // Exit with error code + } +} + +// Running as a jest test to use existing end to end test framework +describe('Alert Checker', () => { + const logger = createDebugLogger('aztec:alert-checker'); + it('should check alerts', async () => { + await runAlertChecker(logger); + }); +}); diff --git a/yarn-project/end-to-end/src/quality_of_service/alerts.yaml b/yarn-project/end-to-end/src/quality_of_service/alerts.yaml new file mode 100644 index 00000000000..44a8a3ffaf8 --- /dev/null +++ b/yarn-project/end-to-end/src/quality_of_service/alerts.yaml @@ -0,0 +1,10 @@ +## A set of alerts for the quality of service of the sequencer, these are tested for in certain e2e tests + +## In end to end tests - page, will cause a test to fail +## Warning will write a message to the PR + +alerts: + - alert: SequencerTimeToCollectAttestations + expr: aztec_sequencer_time_to_collect_attestations > 2500 + labels: + severity: page diff --git a/yarn-project/sequencer-client/src/sequencer/metrics.ts b/yarn-project/sequencer-client/src/sequencer/metrics.ts index 15c391f6357..ea0d14bb6dd 100644 --- a/yarn-project/sequencer-client/src/sequencer/metrics.ts +++ b/yarn-project/sequencer-client/src/sequencer/metrics.ts @@ -21,6 +21,8 @@ export class SequencerMetrics { private currentBlockNumber: Gauge; private currentBlockSize: Gauge; + private timeToCollectAttestations: Gauge; + constructor(client: TelemetryClient, getState: SequencerStateCallback, name = 'Sequencer') { const meter = client.getMeter(name); this.tracer = client.getTracer(name); @@ -60,9 +62,26 @@ export class SequencerMetrics { description: 'Current block number', }); + this.timeToCollectAttestations = meter.createGauge(Metrics.SEQUENCER_TIME_TO_COLLECT_ATTESTATIONS, { + description: 'The time spent collecting attestations from committee members', + }); + this.setCurrentBlock(0, 0); } + startCollectingAttestationsTimer(): () => void { + const startTime = Date.now(); + const stop = () => { + const duration = Date.now() - startTime; + this.recordTimeToCollectAttestations(duration); + }; + return stop.bind(this); + } + + recordTimeToCollectAttestations(time: number) { + this.timeToCollectAttestations.record(time); + } + recordCancelledBlock() { this.blockCounter.add(1, { [Attributes.STATUS]: 'cancelled', diff --git a/yarn-project/sequencer-client/src/sequencer/sequencer.ts b/yarn-project/sequencer-client/src/sequencer/sequencer.ts index 17e371c0738..0ccd5c4ca23 100644 --- a/yarn-project/sequencer-client/src/sequencer/sequencer.ts +++ b/yarn-project/sequencer-client/src/sequencer/sequencer.ts @@ -633,11 +633,13 @@ export class Sequencer { const txHashes = validTxs.map(tx => tx.getTxHash()); this.isFlushing = false; - this.log.info('Collecting attestations'); + this.log.verbose('Collecting attestations'); + const stopCollectingAttestationsTimer = this.metrics.startCollectingAttestationsTimer(); const attestations = await this.collectAttestations(block, txHashes); - this.log.info('Attestations collected'); + this.log.verbose('Attestations collected'); + stopCollectingAttestationsTimer(); + this.log.verbose('Collecting proof quotes'); - this.log.info('Collecting proof quotes'); const proofQuote = await this.createProofClaimForPreviousEpoch(newGlobalVariables.slotNumber.toBigInt()); this.log.info(proofQuote ? `Using proof quote ${inspect(proofQuote.payload)}` : 'No proof quote available'); diff --git a/yarn-project/telemetry-client/src/metrics.ts b/yarn-project/telemetry-client/src/metrics.ts index 2761cfa6afd..f61e5be54ea 100644 --- a/yarn-project/telemetry-client/src/metrics.ts +++ b/yarn-project/telemetry-client/src/metrics.ts @@ -53,6 +53,7 @@ export const SEQUENCER_BLOCK_COUNT = 'aztec.sequencer.block.count'; export const SEQUENCER_CURRENT_STATE = 'aztec.sequencer.current.state'; export const SEQUENCER_CURRENT_BLOCK_NUMBER = 'aztec.sequencer.current.block_number'; export const SEQUENCER_CURRENT_BLOCK_SIZE = 'aztec.sequencer.current.block_size'; +export const SEQUENCER_TIME_TO_COLLECT_ATTESTATIONS = 'aztec.sequencer.time_to_collect_attestations'; export const L1_PUBLISHER_GAS_PRICE = 'aztec.l1_publisher.gas_price'; export const L1_PUBLISHER_TX_COUNT = 'aztec.l1_publisher.tx_count'; diff --git a/yarn-project/telemetry-client/src/prom_otel_adapter.ts b/yarn-project/telemetry-client/src/prom_otel_adapter.ts index 23e8e610bac..ffff02bb1ac 100644 --- a/yarn-project/telemetry-client/src/prom_otel_adapter.ts +++ b/yarn-project/telemetry-client/src/prom_otel_adapter.ts @@ -28,7 +28,7 @@ interface IGauge { set: NoLabels extends Labels ? (value: number) => void : (labels: Labels, value: number) => void; collect?(): void; - addCollect(fn: CollectFn): void; + addCollect(collectFn: CollectFn): void; } interface IHistogram { @@ -101,8 +101,12 @@ export class OtelGauge implements IGaug this.gauge.addCallback(this.handleObservation.bind(this)); } - addCollect(fn: CollectFn): void { - this.collectFns.push(fn); + /** + * Add a collect callback + * @param collectFn - Callback function + */ + addCollect(collectFn: CollectFn): void { + this.collectFns.push(collectFn); } handleObservation(result: any): void { diff --git a/yarn-project/yarn.lock b/yarn-project/yarn.lock index 5acda1df8ca..57c3aa91fda 100644 --- a/yarn-project/yarn.lock +++ b/yarn-project/yarn.lock @@ -553,6 +553,7 @@ __metadata: "@swc/jest": ^0.2.36 "@types/fs-extra": ^11.0.2 "@types/jest": ^29.5.0 + "@types/js-yaml": ^4.0.9 "@types/koa": ^2.13.9 "@types/koa-static": ^4.0.2 "@types/levelup": ^5.1.2 @@ -570,6 +571,7 @@ __metadata: jest: ^29.5.0 jest-extended: ^4.0.2 jest-mock-extended: ^3.0.5 + js-yaml: ^4.1.0 koa: ^2.14.2 koa-static: ^5.0.0 levelup: ^5.1.1 @@ -4687,6 +4689,13 @@ __metadata: languageName: node linkType: hard +"@types/js-yaml@npm:^4.0.9": + version: 4.0.9 + resolution: "@types/js-yaml@npm:4.0.9" + checksum: e5e5e49b5789a29fdb1f7d204f82de11cb9e8f6cb24ab064c616da5d6e1b3ccfbf95aa5d1498a9fbd3b9e745564e69b4a20b6c530b5a8bbb2d4eb830cda9bc69 + languageName: node + linkType: hard + "@types/json-schema@npm:*, @types/json-schema@npm:^7.0.12, @types/json-schema@npm:^7.0.8, @types/json-schema@npm:^7.0.9": version: 7.0.15 resolution: "@types/json-schema@npm:7.0.15"