-
Notifications
You must be signed in to change notification settings - Fork 684
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
subsystem benchmarks: add cpu profiling #2734
Changes from all commits
3975d00
b3bd11d
9226b91
5568eab
0c85296
c77075b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
services: | ||
grafana: | ||
image: grafana/grafana-enterprise:latest | ||
container_name: grafana | ||
restart: always | ||
networks: | ||
- subsystem-bench | ||
ports: | ||
- "3000:3000" | ||
|
||
prometheus: | ||
image: prom/prometheus:latest | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And here |
||
container_name: prometheus | ||
restart: always | ||
networks: | ||
- subsystem-bench | ||
volumes: | ||
- ./prometheus:/etc/prometheus | ||
extra_hosts: | ||
- "host.docker.internal:host-gateway" | ||
ports: | ||
- "9090:9090" | ||
- "9999:9999" | ||
|
||
pyroscope: | ||
container_name: pyroscope | ||
image: grafana/pyroscope:latest | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And here too |
||
restart: always | ||
networks: | ||
- subsystem-bench | ||
ports: | ||
- "4040:4040" | ||
|
||
networks: | ||
subsystem-bench: |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
global: | ||
scrape_interval: 5s | ||
|
||
scrape_configs: | ||
- job_name: "prometheus" | ||
static_configs: | ||
- targets: ["localhost:9090"] | ||
- job_name: "subsystem-bench" | ||
scrape_interval: 0s500ms | ||
static_configs: | ||
- targets: ['host.docker.internal:9999'] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
{ | ||
"annotations": { | ||
"list": [ | ||
{ | ||
"builtIn": 1, | ||
"datasource": { | ||
"type": "grafana", | ||
"uid": "-- Grafana --" | ||
}, | ||
"enable": true, | ||
"hide": true, | ||
"iconColor": "rgba(0, 211, 255, 1)", | ||
"name": "Annotations & Alerts", | ||
"type": "dashboard" | ||
} | ||
] | ||
}, | ||
"editable": true, | ||
"fiscalYearStartMonth": 0, | ||
"graphTooltip": 0, | ||
"id": 1, | ||
"links": [], | ||
"liveNow": false, | ||
"panels": [ | ||
{ | ||
"datasource": { | ||
"type": "grafana-pyroscope-datasource", | ||
"uid": "bc3bc04f-85f9-464b-8ae3-fbe0949063f6" | ||
}, | ||
"gridPos": { | ||
"h": 18, | ||
"w": 24, | ||
"x": 0, | ||
"y": 0 | ||
}, | ||
"id": 1, | ||
"targets": [ | ||
{ | ||
"datasource": { | ||
"type": "grafana-pyroscope-datasource", | ||
"uid": "bc3bc04f-85f9-464b-8ae3-fbe0949063f6" | ||
}, | ||
"groupBy": [], | ||
"labelSelector": "{service_name=\"subsystem-bench\"}", | ||
"profileTypeId": "process_cpu:cpu:nanoseconds:cpu:nanoseconds", | ||
"queryType": "profile", | ||
"refId": "A" | ||
} | ||
], | ||
"title": "CPU Profiling", | ||
"type": "flamegraph" | ||
} | ||
], | ||
"refresh": "", | ||
"schemaVersion": 38, | ||
"tags": [], | ||
"templating": { | ||
"list": [] | ||
}, | ||
"time": { | ||
"from": "now-6h", | ||
"to": "now" | ||
}, | ||
"timepicker": {}, | ||
"timezone": "", | ||
"title": "CPU Profiling", | ||
"uid": "c31191d5-fe2b-49e2-8b1c-1451f31d1628", | ||
"version": 1, | ||
"weekStart": "" | ||
} |
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -18,6 +18,8 @@ | |||
//! CI regression testing. | ||||
use clap::Parser; | ||||
use color_eyre::eyre; | ||||
use pyroscope::PyroscopeAgent; | ||||
use pyroscope_pprofrs::{pprof_backend, PprofConfig}; | ||||
|
||||
use colored::Colorize; | ||||
use std::{path::Path, time::Duration}; | ||||
|
@@ -76,12 +78,34 @@ struct BenchCli { | |||
/// Maximum remote peer latency in milliseconds [0-5000]. | ||||
pub peer_max_latency: Option<u64>, | ||||
|
||||
#[clap(long, default_value_t = false)] | ||||
/// Enable CPU Profiling with Pyroscope | ||||
pub profile: bool, | ||||
|
||||
#[clap(long, requires = "profile", default_value_t = String::from("http://localhost:4040"))] | ||||
/// Pyroscope Server URL | ||||
pub pyroscope_url: String, | ||||
|
||||
#[clap(long, requires = "profile", default_value_t = 113)] | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why 113 ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's the same value we use in the main polkadot's pyroscope. See: polkadot-sdk/polkadot/cli/src/command.rs Line 300 in ebe2aad
According to the PR it was added with, it has no a reason behind. Just a prime number, more than 100. |
||||
/// Pyroscope Sample Rate | ||||
pub pyroscope_sample_rate: u32, | ||||
|
||||
#[command(subcommand)] | ||||
pub objective: cli::TestObjective, | ||||
} | ||||
|
||||
impl BenchCli { | ||||
fn launch(self) -> eyre::Result<()> { | ||||
let agent_running = if self.profile { | ||||
let agent = PyroscopeAgent::builder(self.pyroscope_url.as_str(), "subsystem-bench") | ||||
.backend(pprof_backend(PprofConfig::new().sample_rate(self.pyroscope_sample_rate))) | ||||
.build()?; | ||||
|
||||
Some(agent.start()?) | ||||
} else { | ||||
None | ||||
}; | ||||
|
||||
let configuration = self.standard_configuration; | ||||
let mut test_config = match self.objective { | ||||
TestObjective::TestSequence(options) => { | ||||
|
@@ -165,6 +189,11 @@ impl BenchCli { | |||
env.runtime() | ||||
.block_on(availability::benchmark_availability_read(&mut env, state)); | ||||
|
||||
if let Some(agent_running) = agent_running { | ||||
let agent_ready = agent_running.stop()?; | ||||
agent_ready.shutdown(); | ||||
} | ||||
|
||||
Ok(()) | ||||
} | ||||
} | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Shouldn't we use an explicit version here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why? I think that yml is going to be updated much rarer than the grafana (and others) version. And it's only for local development, if someone has particular preferences they can change it locally.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With
latest
you have no idea what version was used for a deployment, which is bad from reproducibility point of view. Also if a breaking change is introduced in any of the images we might run into issues.But if you say it's just for local development and we don't care for the above - fine by me :)