diff --git a/.github/workflows/benchmark_v2.yml b/.github/workflows/benchmark_v2.yml
new file mode 100644
index 000000000000..350ad0144101
--- /dev/null
+++ b/.github/workflows/benchmark_v2.yml
@@ -0,0 +1,87 @@
+name: Benchmark v2 Framework
+
+on:
+  workflow_call:
+    inputs:
+      runner:
+        description: 'GH Actions runner group to use'
+        required: true
+        type: string
+      commit_sha:
+        description: 'Commit SHA to benchmark'
+        required: false
+        type: string
+        default: ''
+      upload_to_hub:
+        description: 'Whether to upload results to a HuggingFace Dataset'
+        required: false
+        type: string
+        default: 'false'
+      run_id:
+        description: 'Custom run ID for organizing results (auto-generated if not provided)'
+        required: false
+        type: string
+        default: ''
+      benchmark_repo_id:
+        description: 'HuggingFace Dataset to upload results to (e.g., "org/benchmark-results")'
+        required: false
+        type: string
+        default: ''
+
+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  # For gated repositories, we still need to agree to share information on the Hub repo page in order to get access.
+  # This token is created under the bot `hf-transformers-bot`.
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+
+jobs:
+  benchmark-v2:
+    name: Benchmark v2
+    runs-on: ${{ inputs.runner }}
+    if: |
+      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'run-benchmark')) ||
+      (github.event_name == 'schedule')
+    container:
+      image: huggingface/transformers-pytorch-gpu
+      options: --gpus all --privileged --ipc host --shm-size "16gb"
+    steps:
+      - name: Get repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.commit_sha || github.sha }}
+
+      - name: Install benchmark dependencies
+        run: |
+          python3 -m pip install -r benchmark_v2/requirements.txt
+
+      - name: Reinstall transformers in edit mode
+        run: |
+          python3 -m pip uninstall -y transformers
+          python3 -m pip install -e ".[torch]"
+
+      - name: Show installed libraries and their versions
+        run: |
+          python3 -m pip list
+          python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
+          python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+          python3 -c "import torch; print(f'CUDA device count: {torch.cuda.device_count()}')" || true
+          nvidia-smi || true
+
+      - name: Run benchmark v2
+        working-directory: benchmark_v2
+        run: |
+          echo "Running benchmarks"
+          # run_benchmarks.py only understands --upload-to-hub <dataset name>; build the flag
+          # from the two workflow inputs instead of passing an unsupported --benchmark-repo-id.
+          UPLOAD_ARGS=""
+          if [ "${{ inputs.upload_to_hub }}" = "true" ] && [ -n "${{ inputs.benchmark_repo_id }}" ]; then
+            UPLOAD_ARGS="--upload-to-hub ${{ inputs.benchmark_repo_id }}"
+          fi
+          python3 run_benchmarks.py \
+            --commit-id '${{ inputs.commit_sha || github.sha }}' \
+            --run-id '${{ inputs.run_id }}' \
+            --log-level INFO \
+            $UPLOAD_ARGS
+        env:
+          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
\ No newline at end of file
diff --git a/.github/workflows/benchmark_v2_a10_caller.yml b/.github/workflows/benchmark_v2_a10_caller.yml
new file mode 100644
index 000000000000..30b5e8be78a5
--- /dev/null
+++ b/.github/workflows/benchmark_v2_a10_caller.yml
@@ -0,0 +1,20 @@
+name: Benchmark v2 Scheduled Runner - A10 Single-GPU
+
+on:
+  schedule:
+    # Run daily at 16:30 UTC
+    - cron: "30 16 * * *"
+  pull_request:
+    types: [ opened, labeled, reopened, synchronize ]
+
+jobs:
+  benchmark-v2-default:
+    name: Benchmark v2 - Default Models
+    uses: ./.github/workflows/benchmark_v2.yml
+    with:
+      runner: aws-g5-4xlarge-cache-use1-public-80
+      commit_sha: ${{ github.sha }}
+      upload_to_hub: 'true'
+      run_id: ${{ github.run_id }}
+      benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
+    secrets: inherit
\ No newline at end of file
diff --git a/.github/workflows/benchmark_v2_mi325_caller.yml b/.github/workflows/benchmark_v2_mi325_caller.yml
new file mode 100644
index 000000000000..95fbeb5e5f6a
--- /dev/null
+++ b/.github/workflows/benchmark_v2_mi325_caller.yml
@@ -0,0 +1,20 @@
+name: Benchmark v2 Scheduled Runner - MI325 Single-GPU
+
+on:
+  schedule:
+    # Run daily at 16:30 UTC
+    - cron: "30 16 * * *"
+  pull_request:
+    types: [ opened, labeled, reopened, synchronize ]
+
+jobs:
+  benchmark-v2-default:
+    name: Benchmark v2 - Default Models
+    uses: ./.github/workflows/benchmark_v2.yml
+    with:
+      runner: amd-mi325-ci-1gpu
+      commit_sha: ${{ github.sha }}
+      upload_to_hub: 'true'
+      run_id: ${{ github.run_id }}
+      benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
+    secrets: inherit
\ No newline at end of file
diff --git a/benchmark_v2/README.md b/benchmark_v2/README.md
index 9a0102b387fc..1d34de6408c7 100644
--- a/benchmark_v2/README.md
+++ b/benchmark_v2/README.md
@@ -21,6 +21,52 @@ python run_benchmarks.py \
     --num-tokens-to-generate 200
 ```
 
+### Uploading Results to HuggingFace Dataset
+
+You can automatically upload benchmark results to a HuggingFace Dataset for tracking and analysis:
+
+```bash
+# Upload to a public dataset with auto-generated run ID
+python run_benchmarks.py --upload-to-hub username/benchmark-results
+
+# Upload with a custom run ID for easy identification
+python run_benchmarks.py --upload-to-hub username/benchmark-results --run-id experiment_v1
+```
+
+**Dataset Directory Structure:**
+```
+dataset_name/
+├── 2025-01-15/
+│   ├── runs/                                # Non-scheduled runs (manual, PR, etc.)
+│   │   └── 123-1245151651/                  # GitHub run number and ID
+│   │       └── benchmark_results/
+│   │           ├── benchmark_summary_20250115_143022.json
+│   │           └── model-name/
+│   │               └── model-name_benchmark_20250115_143022.json
+│   └── abc123de/                            # Scheduled runs (daily CI), keyed by run ID
+│       └── benchmark_results/
+│           ├── benchmark_summary_20250115_143022.json
+│           └── model-name/
+│               └── model-name_benchmark_20250115_143022.json
+└── 2025-01-16/
+    └── ...
+```
+
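+To pull a day's results back down for analysis, `huggingface_hub` can download just that
+folder (a sketch; the dataset name and date below are placeholders for your own setup):
+
+```python
+from huggingface_hub import snapshot_download
+
+# Download only one day's benchmark files from the results dataset
+local_dir = snapshot_download(
+    repo_id="username/benchmark-results",  # placeholder dataset name
+    repo_type="dataset",
+    allow_patterns=["2025-01-15/*"],
+)
+print(local_dir)
+```
+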
 ### Running Specific Benchmarks
 
 ```bash
diff --git a/benchmark_v2/benches/llama.py b/benchmark_v2/benches/llama.py
index 23427a8549c7..2349e75f1347 100644
--- a/benchmark_v2/benches/llama.py
+++ b/benchmark_v2/benches/llama.py
@@ -20,7 +20,6 @@
 
 from benchmark_framework import ModelBenchmark
 
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "1"
 torch.set_float32_matmul_precision("high")
 
diff --git a/benchmark_v2/requirements.txt b/benchmark_v2/requirements.txt
index a7a435958cf7..e4dcbb3eb7ef 100644
--- a/benchmark_v2/requirements.txt
+++ b/benchmark_v2/requirements.txt
@@ -3,4 +3,5 @@ psutil>=5.8.0
 gpustat>=1.0.0
 torch>=2.0.0
 transformers>=4.30.0
-datasets>=2.10.0
\ No newline at end of file
+datasets>=2.10.0
+huggingface_hub>=0.16.0
\ No newline at end of file
diff --git a/benchmark_v2/run_benchmarks.py b/benchmark_v2/run_benchmarks.py
index 26c816b9d16d..44f6515a2c30 100755
--- a/benchmark_v2/run_benchmarks.py
+++ b/benchmark_v2/run_benchmarks.py
@@ -24,6 +24,7 @@
 import logging
 import os
 import sys
+import uuid
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Optional
@@ -160,7 +161,12 @@ def run_single_benchmark(
     return None
 
 
-def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
+def generate_summary_report(
+    output_dir: str,
+    benchmark_results: dict[str, Any],
+    logger: logging.Logger,
+    benchmark_run_uuid: Optional[str] = None,
+) -> str:
     """Generate a summary report of all benchmark runs."""
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
@@ -168,6 +174,7 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any],
     summary_data = {
         "run_metadata": {
             "timestamp": datetime.utcnow().isoformat(),
+            "benchmark_run_uuid": benchmark_run_uuid,
             "total_benchmarks": len(benchmark_results),
             "successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
             "failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
@@ -183,9 +190,118 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any],
     return summary_file
 
 
+def upload_results_to_hf_dataset(
+    output_dir: str,
+    summary_file: str,
+    dataset_name: str,
+    run_id: Optional[str] = None,
+    logger: Optional[logging.Logger] = None,
+) -> Optional[str]:
+    """
+    Upload benchmark results to a HuggingFace Dataset.
+    Based on upload_collated_report() from utils/collated_reports.py
+    Args:
+        output_dir: Local output directory containing results
+        summary_file: Path to the summary file
+        dataset_name: Name of the HuggingFace dataset to upload to
+        run_id: Unique run identifier (if None, derived from the GitHub Actions run number and ID when available)
+        logger: Logger instance
+    Returns:
+        The run_id used for the upload, None if upload failed
+    """
+    if logger is None:
+        logger = logging.getLogger(__name__)
+
+    import os
+
+    from huggingface_hub import HfApi
+
+    api = HfApi()
+
+    if run_id is None:
+        github_run_number = os.getenv("GITHUB_RUN_NUMBER")
+        github_run_id = os.getenv("GITHUB_RUN_ID")
+        if github_run_number and github_run_id:
+            run_id = f"{github_run_number}-{github_run_id}"
+
+    date_folder = datetime.now().strftime("%Y-%m-%d")
+
+    github_event_name = os.getenv("GITHUB_EVENT_NAME")
+    if github_event_name != "schedule":
+        # Non-scheduled runs go under a runs subfolder
+        repo_path = f"{date_folder}/runs/{run_id}/benchmark_results"
+    else:
+        # Scheduled runs go directly under the date
+        repo_path = f"{date_folder}/{run_id}/benchmark_results"
+
+    logger.info(f"Uploading benchmark results to dataset '{dataset_name}' at path '{repo_path}'")
+
+    try:
+        # Get the authentication token (prioritize specific token, fallback to HF_TOKEN)
+        token = os.getenv("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN") or os.getenv("HF_TOKEN")
+
+        # Upload all files in the output directory
+        from pathlib import Path
+
+        output_path = Path(output_dir)
+
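+        # Each file below is pushed as its own commit on the dataset; if that ever
+        # becomes noisy, huggingface_hub's upload_folder() could push the whole
+        # directory in a single commit instead of looping over files.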
+        for file_path in output_path.rglob("*"):
+            if file_path.is_file():
+                # Calculate relative path from output_dir
+                relative_path = file_path.relative_to(output_path)
+                path_in_repo = f"{repo_path}/{relative_path}"
+
+                logger.debug(f"Uploading {file_path} to {path_in_repo}")
+
+                api.upload_file(
+                    path_or_fileobj=str(file_path),
+                    path_in_repo=path_in_repo,
+                    repo_id=dataset_name,
+                    repo_type="dataset",
+                    token=token,
+                    commit_message=f"Upload benchmark results for run {run_id}",
+                )
+
+        logger.info(
+            f"Successfully uploaded results to: https://huggingface.co/datasets/{dataset_name}/tree/main/{repo_path}"
+        )
+
+        return run_id
+
+    except Exception as upload_error:
+        logger.error(f"Failed to upload results: {upload_error}")
+        import traceback
+
+        logger.debug(traceback.format_exc())
+        return None
+
+
 def main():
     """Main entry point for the benchmarking script."""
-    parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
+    # Generate a unique UUID for this benchmark run
+    benchmark_run_uuid = str(uuid.uuid4())[:8]
+
+    parser = argparse.ArgumentParser(
+        description="Run all benchmarks in the ./benches directory",
+        epilog="""
+Examples:
+  # Run all available benchmarks
+  python3 run_benchmarks.py
+
+  # Run with specific model and upload to HuggingFace Dataset
+  python3 run_benchmarks.py --model-id meta-llama/Llama-2-7b-hf --upload-to-hub username/benchmark-results
+
+  # Run with custom run ID and upload to HuggingFace Dataset
+  python3 run_benchmarks.py --run-id experiment_v1 --upload-to-hub org/benchmarks
+
+  # Run only specific benchmarks with file logging
+  python3 run_benchmarks.py --include llama --enable-file-logging
+        """,  # noqa: W293
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
 
     parser.add_argument(
         "--output-dir",
@@ -228,20 +341,29 @@ def main():
 
     parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
 
-    parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")
-
parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)") parser.add_argument( "--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)" ) + parser.add_argument( + "--upload-to-hub", + type=str, + help="Upload results to HuggingFace Dataset (provide dataset name, e.g., 'username/benchmark-results')", + ) + + parser.add_argument( + "--run-id", type=str, help="Custom run ID for organizing results (if not provided, will generate a unique ID)" + ) + args = parser.parse_args() # Setup logging logger = setup_logging(args.log_level, args.enable_file_logging) logger.info("Starting benchmark discovery and execution") + logger.info(f"Benchmark run UUID: {benchmark_run_uuid}") logger.info(f"Output directory: {args.output_dir}") logger.info(f"Benches directory: {args.benches_dir}") @@ -286,9 +408,6 @@ def main(): if args.model_id: benchmark_kwargs["model_id"] = args.model_id - # Add enable_mock flag for mock benchmark - benchmark_kwargs["enable_mock"] = args.enable_mock - # Add commit_id if provided if args.commit_id: benchmark_kwargs["commit_id"] = args.commit_id @@ -306,7 +425,27 @@ def main(): successful_count += 1 # Generate summary report - summary_file = generate_summary_report(args.output_dir, benchmark_results, logger) + summary_file = generate_summary_report(args.output_dir, benchmark_results, logger, benchmark_run_uuid) + + # Upload results to HuggingFace Dataset if requested + upload_run_id = None + if args.upload_to_hub: + logger.info("=" * 60) + logger.info("UPLOADING TO HUGGINGFACE DATASET") + logger.info("=" * 60) + # Use provided run_id or fallback to benchmark run UUID + effective_run_id = args.run_id or benchmark_run_uuid + upload_run_id = upload_results_to_hf_dataset( + output_dir=args.output_dir, + summary_file=summary_file, + dataset_name=args.upload_to_hub, + run_id=effective_run_id, + logger=logger, + ) + if upload_run_id: + logger.info(f"Upload completed with run ID: {upload_run_id}") + else: + logger.warning("Upload failed - continuing with local results") # Final summary total_benchmarks = len(filtered_benchmarks) @@ -321,6 +460,16 @@ def main(): logger.info(f"Output directory: {args.output_dir}") logger.info(f"Summary report: {summary_file}") + if args.upload_to_hub: + if upload_run_id: + logger.info(f"HuggingFace Dataset: {args.upload_to_hub}") + logger.info(f"Run ID: {upload_run_id}") + logger.info( + f"View results: https://huggingface.co/datasets/{args.upload_to_hub}/tree/main/{datetime.now().strftime('%Y-%m-%d')}/runs/{upload_run_id}" + ) + else: + logger.warning("Upload to HuggingFace Dataset failed") + if failed_count > 0: logger.warning(f"{failed_count} benchmark(s) failed. Check logs for details.") return 1