# .github/workflows/triton-benchmarks.yml

# Workflow title.
name: Triton benchmarks
# Per-run title taken from the `run_name` input, which is declared only under
# the workflow_dispatch trigger below.
run-name: ${{ inputs.run_name }}

# Triggers: manual dispatch (with inputs), a daily schedule, and pull requests
# against main that touch this workflow file or anything under benchmarks/.
on:
  workflow_dispatch:
    inputs:
      # Empty means the job falls back to its default runner label.
      runner_label:
        description: Runner label, keep empty for default
        type: string
        default: ""
      # Forwarded to the report builder through the TAG environment variable.
      tag:
        description: Tag for benchmark results
        type: string
        default: "test"
      # Also decides whether the IPEX-based setup steps run (see USE_IPEX).
      benchmarking_method:
        description: The method used to obtain performance numbers
        type: choice
        options:
          - PYTORCH_LEGACY_PROFILER_USING_IPEX
          - ELAPSED_TIME
          - UPSTREAM_PYTORCH_PROFILER
        default: PYTORCH_LEGACY_PROFILER_USING_IPEX
      run_name:
        description: Run name
        type: string
        default: "Triton benchmarks"
      # Entries are compared against the identifiers used in each benchmark
      # step's `if:` condition (e.g. 'gemm_benchmark.py_default').
      skip_benchmarks:
        description: JSON list of benchmarks to skip
        type: string
        default: "[]"
  # Once a day at minute 5 of hour 23.
  schedule:
    - cron: "5 23 * * *"
  pull_request:
    branches:
      - main
    paths:
      - .github/workflows/triton-benchmarks.yml
      - benchmarks/**

# The workflow token gets read access only, for all scopes.
permissions: read-all

# Workflow-wide environment. `inputs.*` is empty for schedule and pull_request
# events, so every expression below supplies its own fallback.
env:
  # Quoted so the version stays the string "3.10" instead of the float 3.1.
  PYTHON_VERSION: "3.10"
  # Falls back to the legacy IPEX profiler when no input was supplied.
  BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'PYTORCH_LEGACY_PROFILER_USING_IPEX' }}
  # '1' for every non-dispatch event, or when the legacy IPEX profiler was
  # selected on dispatch; '0' otherwise.
  USE_IPEX: ${{ (github.event_name != 'workflow_dispatch' || inputs.benchmarking_method == 'PYTORCH_LEGACY_PROFILER_USING_IPEX') && '1' || '0' }}
  # Explicit tag input wins; otherwise 'pr' for pull requests, 'ci' for
  # scheduled runs, and 'test' for anything else.
  TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && 'pr') || (github.event_name == 'schedule' && 'ci') || 'test' }}

jobs:
  # Single job: prepares the toolchain, then runs each benchmark step in order.
  build:
    name: Triton benchmarks
    # Runner label from the `runner_label` input, defaulting to 'max1550'.
    runs-on:
      - ${{ inputs.runner_label || 'max1550' }}
    # 720 minutes = 12 hours.
    timeout-minutes: 720
    defaults:
      run:
        # Every `run` step first sources the oneAPI setvars.sh (output
        # discarded) and then sources the step's script file ({0}).
        shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
    steps:
      # Check out the repository; the local ./.github/actions/* actions and the
      # benchmarks/ and scripts/ paths used below come from this checkout.
      - name: Checkout repository
        uses: actions/checkout@v4

      # Restore the pip cache through the repository-local `load` action. The
      # step id lets the "Save pip cache" step read this step's outputs.
      - name: Load pip cache
        id: pip-cache
        uses: ./.github/actions/load
        with:
          path: $HOME/.cache/pip
          # pip cache per commit id just to minimize network traffic
          key: pip-$PYTHON_VERSION-$GITHUB_SHA

      # Install the interpreter version pinned in the workflow-level env.
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      # Install the `wheel` and `cmake` packages with pip.
      - name: Install Python build dependencies
        run: pip install wheel cmake

      # Runs only when USE_IPEX is '1': set up PyTorch from the
      # Stonepia/pytorch repository via the local setup-pytorch action.
      - name: Setup PyTorch with IPEX
        if: ${{ env.USE_IPEX == '1' }}
        uses: ./.github/actions/setup-pytorch
        with:
          repository: Stonepia/pytorch

      # Runs only when USE_IPEX is '0': set up PyTorch from the upstream
      # pytorch/pytorch repository via the local setup-pytorch action.
      - name: Setup PyTorch without IPEX
        if: ${{ env.USE_IPEX == '0' }}
        uses: ./.github/actions/setup-pytorch
        with:
          repository: pytorch/pytorch

      # Runs only when USE_IPEX is '1': invoke the local setup-ipex action.
      - name: Setup IPEX
        if: ${{ env.USE_IPEX == '1' }}
        uses: ./.github/actions/setup-ipex

      # Hand a DEBUG=1 bdist_wheel build command to the local setup-triton
      # action. NOTE(review): presumably the action runs it inside python/, since
      # the next step installs from python/dist - confirm against the action.
      - name: Build Triton wheels
        uses: ./.github/actions/setup-triton
        with:
          command: DEBUG=1 python setup.py bdist_wheel

      # Install every wheel found in python/dist.
      - name: Install Triton
        run: pip install python/dist/*.whl

      # Install the `matplotlib`, `pandas` and `tabulate` packages with pip.
      - name: Install benchmark dependencies
        run: pip install matplotlib pandas tabulate

      # Create ./reports and export its absolute path as REPORTS for all later
      # steps (values appended to $GITHUB_ENV apply to subsequent steps only).
      - name: Create reports dir
        run: |
          mkdir reports
          echo "REPORTS=$PWD/reports" >> $GITHUB_ENV

      # Install the package under benchmarks/. The step id is referenced by
      # every benchmark step: they run only if this step's outcome is 'success'.
      - name: Install benchmarks
        id: install
        run: |
          cd benchmarks
          python setup.py install

      # Softmax benchmark, then one report per compiler (triton, xetla).
      # Skipped when 'fused_softmax.py' is listed in `skip_benchmarks`; otherwise
      # runs after a successful install unless the run was cancelled.
      - name: Run Triton Softmax kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python fused_softmax.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv \
            --benchmark softmax --compiler triton --param_cols "N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python ../../scripts/build_report.py \
            $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv \
            --benchmark softmax --compiler xetla --param_cols "N" \
            --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

      # GEMM benchmark with no extra environment overrides. The raw CSV is
      # renamed so later GEMM variants, which write the same file name, do not
      # overwrite it. Skip key: 'gemm_benchmark.py'.
      - name: Run Triton GEMM kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_benchmark.py --reports $REPORTS
          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv \
            --benchmark gemm --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv \
            --benchmark gemm --compiler xetla --param_cols "B,M,K,N" \
            --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

      # GEMM benchmark with TRITON_INTEL_ADVANCED_PATH=0 plus IGC overrides set
      # for the benchmark process only. The report tag gets a '-dflt' suffix.
      # Skip key: 'gemm_benchmark.py_default'.
      - name: Run Triton GEMM kernel benchmark - default path
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_default') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          # Default path:
          TRITON_INTEL_ADVANCED_PATH=0 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra" \
          IGC_DisableLoopUnroll=1 \
          python gemm_benchmark.py --reports $REPORTS
          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv
          source ../../scripts/capture-hw-details.sh
          TAG="${TAG}-dflt"
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv \
            --benchmark gemm --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      # GEMM benchmark with TRITON_INTEL_ADVANCED_PATH=1 plus IGC overrides set
      # for the benchmark process only. The report tag gets an '-adv' suffix.
      # Skip key: 'gemm_benchmark.py_advanced'.
      - name: Run Triton GEMM kernel benchmark - advanced path
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_advanced') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          # Advanced path:
          TRITON_INTEL_ADVANCED_PATH=1 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra" \
          IGC_DisableLoopUnroll=1 \
          python gemm_benchmark.py --reports $REPORTS
          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv
          source ../../scripts/capture-hw-details.sh
          TAG="${TAG}-adv"
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv \
            --benchmark gemm --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      # GEMM benchmark run with TRANSPOSE_B=1; reports are built for the triton
      # and onednn columns. Skip key: 'gemm_benchmark.py_abt'.
      - name: Run Triton GEMM (A@B^t) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS
          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv \
            --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv \
            --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" \
            --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG

      # GEMM benchmark run with TRANSPOSE_A=1; reports are built for the triton
      # and onednn columns. Skip key: 'gemm_benchmark.py_atb'.
      - name: Run Triton GEMM (A^t@B) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS
          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv \
            --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv \
            --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" \
            --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG

      # Stream-k GEMM benchmark and its triton report.
      # Skip key: 'gemm_streamk_benchmark.py'.
      - name: Run Triton GEMM (stream-k) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_streamk_benchmark.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv \
            --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      # Split-k GEMM benchmark and its triton report.
      # Skip key: 'gemm_splitk_benchmark.py'.
      - name: Run Triton GEMM (split-k) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_splitk_benchmark.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv \
            --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      # GEMM with exp pre-op benchmark and its triton report.
      # Skip key: 'gemm_preop_exp_benchmark.py'.
      - name: Run Triton GEMM + PreOp (exp) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_preop_exp_benchmark.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv \
            --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      # GEMM with Gelu post-op benchmark and its triton report.
      # Skip key: 'gemm_postop_gelu_benchmark.py'.
      - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_postop_gelu_benchmark.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv \
            --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      # GEMM with add-matrix post-op benchmark and its triton report.
      # Skip key: 'gemm_postop_addmatrix_benchmark.py'.
      - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_postop_addmatrix_benchmark.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv \
            --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      # Flash-attention forward benchmark with no extra environment overrides;
      # reports are built for the triton and xetla columns.
      # Skip key: 'flash_attention_fwd_benchmark.py'.
      - name: Run Triton FA kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python flash_attention_fwd_benchmark.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv \
            --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
          python ../../scripts/build_report.py \
            $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv \
            --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" \
            --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

      # Flash-attention forward benchmark with TRITON_INTEL_ADVANCED_PATH=0 and
      # IGC overrides set for the benchmark process only. The report tag gets a
      # '-dflt' suffix. Skip key: 'flash_attention_fwd_benchmark.py_default'.
      # The condition must read `inputs.skip_benchmarks` (the declared input);
      # a misspelled input name evaluates as empty, so the step could never be
      # skipped.
      - name: Run Triton FA kernel benchmark - default path
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py_default') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          TRITON_INTEL_ADVANCED_PATH=0 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR" \
          python flash_attention_fwd_benchmark.py --reports $REPORTS

          TAG="${TAG}-dflt"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      # Flash-attention forward benchmark with TRITON_INTEL_ADVANCED_PATH=1 and
      # IGC overrides set for the benchmark process only. The report tag gets an
      # '-adv' suffix. Skip key: 'flash_attention_fwd_benchmark.py_advanced'.
      - name: Run Triton FA kernel benchmark - advanced path
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py_advanced') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          TRITON_INTEL_ADVANCED_PATH=1 \
          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
          IGC_VISAOptions=" -enableBCR" \
          python flash_attention_fwd_benchmark.py --reports $REPORTS
          TAG="${TAG}-adv"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv \
            --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      # Prefix-sums benchmark and its triton report. Skip key: 'prefix_sums.py'.
      - name: Run Prefix Sums kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python prefix_sums.py --reports $REPORTS
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py \
            $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv \
            --benchmark prefix_sums --compiler triton --param_cols "N" \
            --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      # Run the micro benchmark driver from benchmarks/micro_benchmarks. No
      # build_report.py post-processing is done here.
      # Skip key: 'micro_benchmarks'.
      - name: Run micro benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
        run: |
          cd benchmarks/micro_benchmarks
          python run_benchmarks.py --reports $REPORTS

      # Save the pip cache only when the earlier "Load pip cache" step reported
      # a miss; path and dest are taken from that step's outputs.
      # NOTE(review): this condition has no status function, so the implicit
      # success() check applies and the step is skipped if any earlier step
      # failed - confirm that is intended.
      - name: Save pip cache
        if: ${{ steps.pip-cache.outputs.status == 'miss' }}
        uses: ./.github/actions/save
        with:
          path: ${{ steps.pip-cache.outputs.path }}
          dest: ${{ steps.pip-cache.outputs.dest }}

      # Upload the reports directory as the 'benchmark-reports' artifact. Runs
      # after a successful install even if a benchmark step failed, unless the
      # run was cancelled.
      - name: Upload benchmark reports
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-reports
          path: reports