From dd57c963c7d2d1c3ab17bc447bb7a5a30f35e798 Mon Sep 17 00:00:00 2001 From: Matthew Keil Date: Thu, 12 Oct 2023 21:27:32 +0200 Subject: [PATCH] feat(beacon-node): network worker event latency metrics (#5800) * feat(beacon-node): pass metrics to workerEvents * feat(beacon-node): add types for network worker event metrics * feat(beacon-node): add metrics for network worker events * feat(beacon-node): add metric data to network worker events * fix(beacon-node): move async iterator timestamps to correct location * fix(beacon-node): Omit unnecessary emittedAt from parameter type * feat(dashboards): add network worker thread metrics to dashboard * fix(dashboard): change metric name back to lodestar_ prefix * fix: run check-types and update missed test types * Revert "feat(beacon-node): add types for network worker event metrics" This reverts commit e8dc6d29458d5c54c02a71c01f390f34118605ab. * Revert "fix(beacon-node): move async iterator timestamps to correct location" This reverts commit 8b3f6c606e0857e94275daa25381b77c4e8c8202. * Revert "fix(beacon-node): Omit unnecessary emittedAt from parameter type" This reverts commit b7fde56b5f903cc767a84a9f5ce861d867c8b480. * Revert "feat(beacon-node): add metric data to network worker events" This reverts commit 631f57071ebf5a5595f2c0d02c8ed365825ced61. * feat(beacon-node): capture worker message in hrTime * fix(dashboards): remove re-emit panel from network worker row * fix(metrics): remove unused re-emit metrics * Revert "fix: run check-types and update missed test types" This reverts commit 525d1577baa6ddf5050fc89f19de8fcdbeca2373. * fix(beacon-node): update metric name at call site * fix(metrics): update capture to ISU units * refactor(beacon-node): move initialization out of conditional * fix: remove unused import * feat(metrics): add unit to metric name * feat(metrics): add unit to metric name * feat: add trace log statement to network worker * fix: change trace to debug log * feat(metrics): add eventName to network worker message metrics * bug(logger): check if trace is broken * feat(metrics): add worker eventDirection label * fix(metrics): use string instead of enum for eventDirection * fix(metrics): remove eventDirection label * feat(dashboards): add average panel for network worker message * fix(metrics): update naming per Nico's suggestions * refactor: remove unused logger from workerEvents.ts * fix(metrics): add network worker unit name back * fix(beacon-node): use bigint for hrtime in worker message metric * fix(dashboards): remove dashboard changes. moved to PR#5827 * fix(dashboards): remove dashboard changes. moved to PR#5827 * fix(dashboards): remove dashboard changes. moved to PR#5827 * fix: constant case for nano conversion * Revert "fix(beacon-node): use bigint for hrtime in worker message metric" This reverts commit 59f12ff89cdc1cc98c33befc1e980d777419728a. * refactor: remove Sec suffix in metric variable name * fix(dashboard): make metric name match the updates in PR --- dashboards/lodestar_networking.json | 20 +++++++++--------- .../src/metrics/metrics/lodestar.ts | 6 ++++++ .../beacon-node/src/network/core/metrics.ts | 8 +++++++ .../src/network/core/networkCoreWorker.ts | 12 ++++++----- .../network/core/networkCoreWorkerHandler.ts | 2 ++ packages/beacon-node/src/util/workerEvents.ts | 21 +++++++++++++++++++ 6 files changed, 54 insertions(+), 15 deletions(-) diff --git a/dashboards/lodestar_networking.json b/dashboards/lodestar_networking.json index 7a18f218db7b..8633faeb7668 100644 --- a/dashboards/lodestar_networking.json +++ b/dashboards/lodestar_networking.json @@ -1,12 +1,12 @@ { "__inputs": [ { - "description": "", - "label": "Prometheus", "name": "DS_PROMETHEUS", + "type": "datasource", + "label": "Prometheus", + "description": "", "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" + "pluginName": "Prometheus" } ], "annotations": { @@ -937,7 +937,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "(\n sum(rate(\n lodestar_network_worker_wire_events_on_worker_thread_latency_sum[$rate_interval]\n )) \n +\n sum(rate(\n lodestar_network_worker_wire_events_on_main_thread_latency_sum[$rate_interval]\n ))\n)\n/\n(\n sum(rate(\n lodestar_network_worker_wire_events_on_worker_thread_latency_count[$rate_interval]\n ))\n +\n sum(rate(\n lodestar_network_worker_wire_events_on_main_thread_latency_count[$rate_interval]\n ))\n)", + "expr": "(\n sum(rate(\n lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_sum[$rate_interval]\n )) \n +\n sum(rate(\n lodestar_network_worker_wire_events_on_main_thread_latency_seconds_sum[$rate_interval]\n ))\n)\n/\n(\n sum(rate(\n lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_count[$rate_interval]\n ))\n +\n sum(rate(\n lodestar_network_worker_wire_events_on_main_thread_latency_seconds_count[$rate_interval]\n ))\n)", "hide": false, "interval": "", "legendFormat": "Average", @@ -951,7 +951,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "avg(rate(lodestar_network_worker_wire_events_on_worker_thread_latency_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_worker_thread_latency_count[$rate_interval]))", + "expr": "avg(rate(lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_count[$rate_interval]))", "hide": false, "interval": "", "legendFormat": "Worker to Main", @@ -964,7 +964,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(rate(lodestar_network_worker_wire_events_on_main_thread_latency_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_count[$rate_interval]))", + "expr": "avg(rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_count[$rate_interval]))", "hide": false, "legendFormat": "Main to Worker", "range": true, @@ -1051,7 +1051,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "avg(rate(lodestar_network_worker_wire_events_on_main_thread_latency_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_count[$rate_interval]))", + "expr": "avg(rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_count[$rate_interval]))", "hide": true, "interval": "", "legendFormat": "Average to Main", @@ -1065,7 +1065,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(lodestar_network_worker_wire_events_on_main_thread_latency_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_count[$rate_interval])", + "expr": "rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_main_thread_latency_seconds_count[$rate_interval])", "interval": "", "legendFormat": "{{eventName}}", "range": true, @@ -1077,7 +1077,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "rate(lodestar_network_worker_wire_events_on_worker_thread_latency_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_worker_thread_latency_count[$rate_interval])", + "expr": "rate(lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_sum[$rate_interval])/rate(lodestar_network_worker_wire_events_on_worker_thread_latency_seconds_count[$rate_interval])", "hide": false, "legendFormat": "{{eventName}}", "range": true, diff --git a/packages/beacon-node/src/metrics/metrics/lodestar.ts b/packages/beacon-node/src/metrics/metrics/lodestar.ts index 8b8ce0f0c2bc..5ffae34a9eeb 100644 --- a/packages/beacon-node/src/metrics/metrics/lodestar.ts +++ b/packages/beacon-node/src/metrics/metrics/lodestar.ts @@ -121,6 +121,12 @@ export function createLodestarMetrics( help: "Current count of pending items in reqRespBridgeReqCaller data structure", }), }, + networkWorkerWireEventsOnMainThreadLatency: register.histogram<"eventName">({ + name: "lodestar_network_worker_wire_events_on_main_thread_latency_seconds", + help: "Latency in seconds to transmit network events to main thread across worker port", + labelNames: ["eventName"], + buckets: [0.001, 0.003, 0.01, 0.03, 0.1], + }), regenQueue: { length: register.gauge({ diff --git a/packages/beacon-node/src/network/core/metrics.ts b/packages/beacon-node/src/network/core/metrics.ts index 78bc88d52fe7..e5ce0bede447 100644 --- a/packages/beacon-node/src/network/core/metrics.ts +++ b/packages/beacon-node/src/network/core/metrics.ts @@ -333,6 +333,8 @@ export function createNetworkCoreMetrics(register: RegistryMetricCreator) { }; } +export type NetworkCoreWorkerMetrics = ReturnType; + // eslint-disable-next-line @typescript-eslint/explicit-function-return-type export function getNetworkCoreWorkerMetrics(register: RegistryMetricCreator) { return { @@ -340,5 +342,11 @@ export function getNetworkCoreWorkerMetrics(register: RegistryMetricCreator) { name: "lodestar_network_worker_reqresp_bridge_caller_pending_count", help: "Current count of pending elements in respBridgeCaller", }), + networkWorkerWireEventsOnWorkerThreadLatency: register.histogram<"eventName">({ + name: "lodestar_network_worker_wire_events_on_worker_thread_latency_seconds", + help: "Latency in seconds to transmit network events to worker thread across parent port", + labelNames: ["eventName"], + buckets: [0.001, 0.003, 0.01, 0.03, 0.1], + }), }; } diff --git a/packages/beacon-node/src/network/core/networkCoreWorker.ts b/packages/beacon-node/src/network/core/networkCoreWorker.ts index a0c8ff22fe60..35303190a8f8 100644 --- a/packages/beacon-node/src/network/core/networkCoreWorker.ts +++ b/packages/beacon-node/src/network/core/networkCoreWorker.ts @@ -13,6 +13,9 @@ import {peerIdToString} from "../../util/peerId.js"; import {profileNodeJS} from "../../util/profile.js"; import {NetworkEventBus, NetworkEventData, networkEventDirection} from "../events.js"; import {wireEventsOnWorkerThread} from "../../util/workerEvents.js"; +import {getNetworkCoreWorkerMetrics} from "./metrics.js"; +import {NetworkWorkerApi, NetworkWorkerData} from "./types.js"; +import {NetworkCore} from "./networkCore.js"; import { NetworkWorkerThreadEventType, ReqRespBridgeEventBus, @@ -21,9 +24,6 @@ import { getReqRespBridgeRespEvents, reqRespBridgeEventDirection, } from "./events.js"; -import {getNetworkCoreWorkerMetrics} from "./metrics.js"; -import {NetworkCore} from "./networkCore.js"; -import {NetworkWorkerApi, NetworkWorkerData} from "./types.js"; // Cloned data from instantiation const workerData = worker.workerData as NetworkWorkerData; @@ -83,9 +83,9 @@ new AsyncIterableBridgeHandler(getReqRespBridgeReqEvents(reqRespBridgeEventBus), ); const reqRespBridgeRespCaller = new AsyncIterableBridgeCaller(getReqRespBridgeRespEvents(reqRespBridgeEventBus)); +const networkCoreWorkerMetrics = metricsRegister ? getNetworkCoreWorkerMetrics(metricsRegister) : null; // respBridgeCaller metrics -if (metricsRegister) { - const networkCoreWorkerMetrics = getNetworkCoreWorkerMetrics(metricsRegister); +if (networkCoreWorkerMetrics) { networkCoreWorkerMetrics.reqRespBridgeRespCallerPending.addCollect(() => { networkCoreWorkerMetrics.reqRespBridgeRespCallerPending.set(reqRespBridgeRespCaller.pendingCount); }); @@ -110,12 +110,14 @@ wireEventsOnWorkerThread( NetworkWorkerThreadEventType.networkEvent, events, parentPort, + networkCoreWorkerMetrics, networkEventDirection ); wireEventsOnWorkerThread( NetworkWorkerThreadEventType.reqRespBridgeEvents, reqRespBridgeEventBus, parentPort, + networkCoreWorkerMetrics, reqRespBridgeEventDirection ); diff --git a/packages/beacon-node/src/network/core/networkCoreWorkerHandler.ts b/packages/beacon-node/src/network/core/networkCoreWorkerHandler.ts index 73ca9e9c5fd0..8c944dd87d2d 100644 --- a/packages/beacon-node/src/network/core/networkCoreWorkerHandler.ts +++ b/packages/beacon-node/src/network/core/networkCoreWorkerHandler.ts @@ -75,12 +75,14 @@ export class WorkerNetworkCore implements INetworkCore { NetworkWorkerThreadEventType.networkEvent, modules.events, modules.worker as unknown as worker_threads.Worker, + modules.metrics, networkEventDirection ); wireEventsOnMainThread( NetworkWorkerThreadEventType.reqRespBridgeEvents, this.reqRespBridgeEventBus, modules.worker as unknown as worker_threads.Worker, + modules.metrics, reqRespBridgeEventDirection ); diff --git a/packages/beacon-node/src/util/workerEvents.ts b/packages/beacon-node/src/util/workerEvents.ts index cd61e6b95393..807bf7a30618 100644 --- a/packages/beacon-node/src/util/workerEvents.ts +++ b/packages/beacon-node/src/util/workerEvents.ts @@ -2,11 +2,16 @@ import {MessagePort, Worker} from "node:worker_threads"; import {Thread} from "@chainsafe/threads"; import {Logger} from "@lodestar/logger"; import {sleep} from "@lodestar/utils"; +import {Metrics} from "../metrics/metrics.js"; +import {NetworkCoreWorkerMetrics} from "../network/core/metrics.js"; import {StrictEventEmitterSingleArg} from "./strictEvents.js"; +const NANO_TO_SECOND_CONVERSION = 1e9; + export type WorkerBridgeEvent = { type: string; event: keyof EventData; + posted: [number, number]; data: EventData[keyof EventData]; }; @@ -27,6 +32,7 @@ export function wireEventsOnWorkerThread( mainEventName: string, events: StrictEventEmitterSingleArg, parentPort: MessagePort, + metrics: NetworkCoreWorkerMetrics | null, isWorkerToMain: {[K in keyof EventData]: EventDirection} ): void { // Subscribe to events from main thread @@ -37,6 +43,12 @@ export function wireEventsOnWorkerThread( // This check is not necessary but added for safety in case of improper implemented events isWorkerToMain[data.event] === EventDirection.mainToWorker ) { + const [sec, nanoSec] = process.hrtime(data.posted); + const networkWorkerLatency = sec + nanoSec / NANO_TO_SECOND_CONVERSION; + metrics?.networkWorkerWireEventsOnWorkerThreadLatency.observe( + {eventName: data.event as string}, + networkWorkerLatency + ); events.emit(data.event, data.data); } }); @@ -48,6 +60,7 @@ export function wireEventsOnWorkerThread( const workerEvent: WorkerBridgeEvent = { type: mainEventName, event: eventName, + posted: process.hrtime(), data, }; parentPort.postMessage(workerEvent); @@ -60,6 +73,7 @@ export function wireEventsOnMainThread( mainEventName: string, events: StrictEventEmitterSingleArg, worker: Pick, + metrics: Metrics | null, isWorkerToMain: {[K in keyof EventData]: EventDirection} ): void { // Subscribe to events from main thread @@ -70,6 +84,12 @@ export function wireEventsOnMainThread( // This check is not necessary but added for safety in case of improper implemented events isWorkerToMain[data.event] === EventDirection.workerToMain ) { + const [sec, nanoSec] = process.hrtime(data.posted); + const networkWorkerLatency = sec + nanoSec / NANO_TO_SECOND_CONVERSION; + metrics?.networkWorkerWireEventsOnMainThreadLatency.observe( + {eventName: data.event as string}, + networkWorkerLatency + ); events.emit(data.event, data.data); } }); @@ -81,6 +101,7 @@ export function wireEventsOnMainThread( const workerEvent: WorkerBridgeEvent = { type: mainEventName, event: eventName, + posted: process.hrtime(), data, }; worker.postMessage(workerEvent);