Performance Regression Testing Revamp Stage 1 #4602
# Performance Regression Testing

This directory includes dbt project setups that are known performance bottlenecks, and a test runner written in Rust which runs specific dbt commands on each of the projects. Orchestration is done via the GitHub Actions workflow in `/.github/workflows/performance.yml`. The workflow is scheduled to run every night, but it can also be triggered manually.

This test suite samples the performance characteristics of individual commits against performance models for prior releases. Performance is measured in project-command pairs, which are assumed to conform to a normal distribution. The sampling and comparison are efficient enough to run against PRs.

The GitHub workflow hardcodes our baseline branch for performance metrics as `0.20.latest`. As future versions become faster, this branch will be updated to hold us to those new standards.
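The per-pair modeling described above can be sketched in a few lines. This is an illustrative sketch rather than the actual runner code, and the sample runtimes below are made up:

```rust
// Illustrative sketch (not the actual runner code): fit a normal model
// (mean and sample standard deviation) to sampled runtimes for one
// project-command pair.
fn mean(xs: &[f64]) -> f64 {
    xs.iter().sum::<f64>() / xs.len() as f64
}

fn stddev(xs: &[f64]) -> f64 {
    let m = mean(xs);
    // sample variance (divide by n - 1)
    let var = xs.iter().map(|x| (x - m).powi(2)).sum::<f64>() / (xs.len() - 1) as f64;
    var.sqrt()
}

fn main() {
    // hypothetical `dbt parse` runtimes in seconds
    let samples = [49.5, 50.1, 49.9, 50.3, 49.7];
    println!("mean = {:.2}s, stddev = {:.2}s", mean(&samples), stddev(&samples));
}
```

These two fitted values are all the comparison step needs in order to decide whether a later observation is suspiciously slow.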
## Adding a new dbt project

Performance baselines measured during our release process are committed to this directory via GitHub Action. (TODO make the file and name it here)

## Investigating Regressions

If your commit has failed one of the performance regression tests, it does not necessarily contain a performance regression. However, the observed runtime was so much slower than the expected value that it is unlikely to be random noise. Any commit between the release being compared against and this failing commit might contain the cause. Start by investigating the failing commit and work your way backwards.
## The Statistics

Particle physicists need to be confident in declaring new discoveries, snack manufacturers need to be sure each snack is within the regulated margin of error for nutrition facts, and weight-rated climbing gear needs to be produced so that you can trust your life to every unit that comes off the line. All of these use cases rely on the same kind of math: sigma-based p-values. This section will peel apart that math with the help of a physicist and walk through how we apply this approach to performance regression testing in this test suite.

You are likely familiar with forming a hypothesis of the form "A and B are correlated", which is known as _the research hypothesis_. It follows that the hypothesis "A and B are not correlated" is also relevant, and it is known as _the null hypothesis_. When looking at data, we commonly use a _p-value_ to determine the significance of the data. Formally, a _p-value_ is the probability of obtaining data at least as extreme as the ones observed, if the null hypothesis is true. To refine this definition, the experimental particle physicist [Dr. Tommaso Dorigo](https://userswww.pd.infn.it/~dorigo/#about) has an excellent [glossary](https://www.science20.com/quantum_diaries_survivor/fundamental_glossary_higgs_broadcast-85365) of these terms that helps clarify: "'Extreme' is quite tricky instead: it depends on what is your 'alternate hypothesis' of reference, and what kind of departure it would produce on the studied statistic derived from the data. So 'extreme' will mean 'departing from the typical values expected for the null hypothesis, toward the values expected from the alternate hypothesis.'" In the context of performance regression testing, our research hypothesis is that "after commit A, the codebase includes a performance regression", which means we expect the runtime of our measured processes to be _slower_ than the expected value, not faster.
Given this definition of p-value, we need to explicitly call out the common tendency to apply _probability inversion_ to our observations. To quote [Dr. Tommaso Dorigo](https://www.science20.com/quantum_diaries_survivor/fundamental_glossary_higgs_broadcast-85365) again, "If your ability on the long jump puts you in the 99.99% percentile, that does not mean that you are a kangaroo, and neither can one infer that the probability that you belong to the human race is 0.01%." Using our previously defined terms, the p-value is _not_ the probability that the null hypothesis _is true_.

This brings us to calculating sigma values. Sigma refers to the standard deviation of a statistical model, which is used as a measurement of how far away an observed value is from the expected value. When we say that we have a "3 sigma result", we are saying that if the null hypothesis is true, this is a particularly unlikely observation; it is not a claim about the probability that the null hypothesis is true. Exactly how unlikely depends on the expected values under our research hypothesis. In the context of performance regression testing, if the null hypothesis is false, we expect the results to be _slower_ than the expected value, not _slower or faster_. Looking at the normal distribution below, we can see that we only care about one _half_ of the distribution: the half where the values are slower than the expected value. This means that when we calculate the p-value, we do not include both sides of the normal distribution.

![normal distribution](./images/normal.svg)
Because of this, the following table describes the significance of each sigma level for our _one-sided_ hypothesis:

| σ   | p-value        | scientific significance |
| --- | -------------- | ----------------------- |
| 1 σ | 1 in 6         |                         |
| 2 σ | 1 in 44        |                         |
| 3 σ | 1 in 741       | evidence                |
| 4 σ | 1 in 31,574    |                         |
| 5 σ | 1 in 3,486,914 | discovery               |
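The sigma-to-p-value mapping in the table can be reproduced numerically. The sketch below uses the Abramowitz and Stegun polynomial approximation of the error function (absolute error at most about 1.5e-7, which is accurate through roughly 4 sigma; the 5 sigma row would need a more precise tail method):

```rust
// One-sided upper-tail p-value P(Z > sigma) for a standard normal,
// using the Abramowitz & Stegun 7.1.26 approximation of erf.
fn erf(x: f64) -> f64 {
    let sign = if x < 0.0 { -1.0 } else { 1.0 };
    let x = x.abs();
    let t = 1.0 / (1.0 + 0.3275911 * x);
    // Horner evaluation of the degree-5 polynomial in t
    let poly = t * (0.254829592
        + t * (-0.284496736
        + t * (1.421413741
        + t * (-1.453152027
        + t * 1.061405429))));
    sign * (1.0 - poly * (-x * x).exp())
}

fn one_sided_p(sigma: f64) -> f64 {
    // P(Z > sigma) = 0.5 * erfc(sigma / sqrt(2))
    0.5 * (1.0 - erf(sigma / std::f64::consts::SQRT_2))
}

fn main() {
    for s in 1..=4 {
        let p = one_sided_p(s as f64);
        println!("{} sigma: p = {:.3e} (1 in {:.0})", s, p, 1.0 / p);
    }
}
```

Running this recovers the "1 in 6", "1 in 44", and "1 in 741" figures from the table (the one-sided values are roughly double the odds of the more familiar two-sided ones).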
When detecting performance regressions that trigger alerts, block PRs, or delay releases, we want to be conservative enough that detections are infrequently triggered by noise, but not so conservative as to miss most actual regressions. This test suite uses a 3 sigma standard, so only about 1 in every 700 runs is expected to fail the performance regression test suite due to expected variance in our measurements.
### Concrete Example

The following example data was collected locally on a MacBook Pro using the same tools included in this repository.

> **Reviewer:** Wouldn't we want to collect the sample mean + stddev on the "same machine" as what will be running in CI? I know that's technically impossible, since GHA is a cloud service using VMs — but it's still possible to match the same basic architecture and memory characteristics, right? Versus a macbook pro running locally
>
> **Author:** you're totally correct. The macbook numbers are all I have right now so I used those to give concrete numbers to the abstract concepts. Your comment here tells me that I should replace these with github action numbers once I have them though.
In dbt v1.0.1, we have the following mean and standard deviation when parsing a dbt project with 2000 models:

μ (mean): 49.82 seconds<br/>
σ (stddev): 0.5212 seconds<br/>
The 2-sided 3 sigma range can be calculated from these two values via:

x < μ - 3σ or x > μ + 3σ<br/>
x < 49.82 - 3 * 0.5212 or x > 49.82 + 3 * 0.5212<br/>
x < 48.26 or x > 51.38<br/>

It follows that the 1-sided 3 sigma range for performance regressions is just:<br/>
x > 51.38
If we then sample a single `dbt parse` of the same project, using a commit slated to go into dbt v1.0.2, on the same MacBook Pro under the same conditions, and we observe a 52s parse time, then this observation is so unlikely in the absence of a code-induced performance regression that we should investigate whether any of the commits between this failure and the commit where the initial distribution was measured contains a regression.
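The worked numbers above can be checked with a few lines. The μ and σ values are taken from the example, and the 52s observation is the hypothetical regression:

```rust
// One-sided 3 sigma threshold from the example's fitted model:
// any observation slower than mu + 3 * sigma is flagged.
fn regression_cutoff(mu: f64, sigma: f64) -> f64 {
    mu + 3.0 * sigma
}

fn main() {
    let cutoff = regression_cutoff(49.82, 0.5212);
    println!("cutoff = {:.2}s", cutoff); // prints "cutoff = 51.38s"
    let observed = 52.0;
    println!("flag regression: {}", observed > cutoff);
}
```

Because the test is one-sided, an observation of 48s, although equally far from the mean, would not be flagged.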
Observations with 3 sigma significance that are _not_ performance regressions could be due to observing genuinely unlikely values (about 1 in every 741 observations), or to variation in the instruments we use to take these measurements, such as GitHub Actions runners. At this time we do not measure the variation in these instruments, so it is not accounted for in our calculations.
## Expanding the Tests

Regression tests run pre-defined dbt commands across a set of source-committed dbt projects that are known to cause performance bottlenecks. This collection of projects and commands should expand over time, reflecting user feedback about poorly performing projects, so that future versions are protected against regressions in these scenarios.
### Adding a new dbt project

Just make a new directory under `performance/projects/`. It will automatically be picked up by the tests.

### Adding a new dbt command

In `runner/src/measure.rs::measure`, add a metric to the `metrics` Vec. The GitHub Action will handle recompilation if you don't have the Rust toolchain installed.
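As a sketch only (the real metric type in `runner/src/measure.rs` may differ, and the field names here are hypothetical), adding a command amounts to pushing one more entry onto the `metrics` Vec:

```rust
// Hypothetical shape of a metric entry; the actual struct in
// runner/src/measure.rs may differ.
struct Metric {
    name: String, // label used to identify the measurement in output
    cmd: String,  // the dbt command being timed
}

fn metrics() -> Vec<Metric> {
    vec![
        Metric {
            name: "parse".to_string(),
            cmd: "dbt parse".to_string(),
        },
        // a new command would be one more entry here
    ]
}

fn main() {
    for m in metrics() {
        println!("{}: {}", m.name, m.cmd);
    }
}
```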
## Future work

- add more projects to test different configurations that have been known bottlenecks
- add more dbt commands to measure
- possibly use the uploaded json artifacts to store these results so they can be graphed over time
- read new metrics from a file so no one has to edit Rust source to add them to the suite
- instead of building the Rust runner every time, publish and pull down the latest version
- instead of manually setting the baseline version of dbt to test, pull down the latest stable version as the baseline
> **Reviewer:** Sweet! Confirming that we'll have this data saved to / available from S3?
>
> **Author:** Not at first. The first version of this test suite is going to add a commit to the repository with a json file for the full modeling of each release. The samples are printed in GitHub Actions but ultimately not saved. If we want any of the above to be available in S3, we can track that work with a separate ticket.