data/benchmarks #422
@@ -0,0 +1,34 @@

# Install dependencies

```
pip install -r benchmarks/requirements.txt
python setup.py develop
```

# Usage instructions

```
usage: run_benchmark.py [-h] [--dataset DATASET] [--model_name MODEL_NAME] [--batch_size BATCH_SIZE] [--device DEVICE] [--num_epochs NUM_EPOCHS]
                        [--report_location REPORT_LOCATION] [--num_workers NUM_WORKERS] [--shuffle] [--dataloaderv DATALOADERV]
```
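For example, a typical invocation (the flag values here are illustrative):

```
python run_benchmark.py --dataset gtsrb --model_name resnext50_32x4d --batch_size 32 --num_epochs 5
```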
## Available metrics

- [x] Total time
- [x] Time per batch
- [x] Time per epoch
- [x] Precision over time
- [x] CPU Load
- [x] GPU Load
- [x] Memory usage
## Additional profiling

The PyTorch profiler doesn't currently work well with `torchdata` (see https://github.com/pytorch/kineto/issues/609), but
there are other good options, such as `py-spy` or `scalene`, which can be invoked as `profiler_name run_benchmark.py`.
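For example, a sampling profile with `py-spy` (assuming it has been installed via `pip install py-spy`) can be collected like so:

```
py-spy record -o profile.svg -- python run_benchmark.py
```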
## Other benchmarks in the wild

- https://github.com/pytorch/kineto/blob/main/tb_plugin/examples/datapipe_example.py
- https://github.com/pytorch/text/tree/main/test/datasets
- https://github.com/pytorch/vision/tree/main/torchvision/prototype/datasets
@@ -0,0 +1,30 @@

from dataclasses import dataclass

from simple_parsing import ArgumentParser


@dataclass(frozen=True)
class BenchmarkConfig:
    dataset: str = "gtsrb"  # TODO: Integrate with HF datasets
    model_name: str = "resnext50_32x4d"  # TODO: torchvision models supported only
    batch_size: int = 1
    device: str = "cuda:0"  # Options are cpu or cuda:0
    num_epochs: int = 1
    report_location: str = "report.csv"
    num_workers: int = 1
    shuffle: bool = True
    dataloader_version: int = 1  # Options are 1 or 2


## Arg parsing
def arg_parser():
    parser = ArgumentParser()
    parser.add_arguments(BenchmarkConfig, dest="options")
    args = parser.parse_args()
    benchmark_config = args.options
    return benchmark_config


if __name__ == "__main__":
    arg_parser()
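A minimal usage sketch: `simple_parsing` turns each `BenchmarkConfig` field into a CLI flag, so assuming this module is saved as, say, `benchmark_config.py` (a hypothetical name):

```python
# Hypothetical shell invocation: python benchmark_config.py --batch_size 32 --device cpu
config = arg_parser()
print(config.batch_size, config.device)  # typed values from the frozen dataclass
```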
@@ -0,0 +1,34 @@

This folder contains templates that are useful for cloud setups.

The idea is to provision a machine by configuring it in a YAML file and then run a benchmark script on it automatically. This is critical both for reproducible ad hoc benchmarking and for including real-world benchmarks in a release.

We've provided some useful `yml` templates to get you started; for background on deploying them, see
https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/using-cfn-cli-creating-stack.html

## Setup aws cli

Run `aws configure` and enter your credentials.

## Setup stack (machine configuration)

```sh
aws cloudformation create-stack \
    --stack-name torchdatabenchmark \
    --template-body file://ec2.yml \
    --parameters ParameterKey=InstanceTypeParameter,ParameterValue=p3.2xlarge ParameterKey=DiskType,ParameterValue=gp3
```
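Once the stack reaches `CREATE_COMPLETE`, one way to find the elastic IP to ssh into (a sketch; assumes this is the only EIP allocated in the region):

```sh
aws ec2 describe-addresses --query 'Addresses[].PublicIp' --output text
```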
## Ssh into machine and run job

```
ssh elastic_ip
git clone https://github.com/pytorch/data
cd data/benchmarks
python run_benchmark.py
```

Visually inspect the logs.

## Shut down stack

`aws cloudformation delete-stack --stack-name torchdatabenchmark`
@@ -0,0 +1,84 @@

# This template sets up an EC2 instance with an elastic IP and a disk volume
Parameters:
  InstanceTypeParameter:
    Type: String
    Default: c5n.large
    AllowedValues:
      - c5n.large
      - p2.2xlarge
      - p3.2xlarge
      - p3.8xlarge
    Description: Instance type (CPU or GPU)
  DiskSize:
    Type: Number
    Default: 100
    Description: Disk size in GB
  DiskType:
    Type: String
    Default: gp2
    AllowedValues:
      - gp2
      - gp3
      - io1
      - io2
      - sc1
      - st1
      - standard
    Description: Disk type (SSD or HDD)

Resources:
  MyInstance:
    Type: AWS::EC2::Instance
    Properties:
      AvailabilityZone: us-west-2a
      ImageId: ami-0306d46d05aaf8663 # Deep Learning AMI
      InstanceType:
        Ref: InstanceTypeParameter
      SecurityGroups:
        - !Ref SSHSecurityGroup

  # Elastic IP so we can easily ssh into the machine
  MyEIP:
    Type: AWS::EC2::EIP
    Properties:
      InstanceId: !Ref MyInstance

  # Open security group for SSH
  SSHSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: Enable SSH access via port 22
      SecurityGroupIngress:
        - CidrIp: 0.0.0.0/0
          FromPort: 22
          IpProtocol: tcp
          ToPort: 22

  NewVolume:
    Type: AWS::EC2::Volume
    Properties:
      Size:
        Ref: DiskSize
      VolumeType:
        Ref: DiskType
      AvailabilityZone: !GetAtt MyInstance.AvailabilityZone
      Tags:
        - Key: MyTag
          Value: TagValue
    DeletionPolicy: Snapshot

  MountPoint:
    Type: AWS::EC2::VolumeAttachment
    Properties:
      InstanceId: !Ref MyInstance
      VolumeId: !Ref NewVolume
      Device: /dev/sdh

  # # Volume
  # SSD:
  #   Type: AWS::EC2::VolumeAttachment
  #   Properties:
  #     InstanceId: !Ref MyInstance

  # HDD:
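One possible extension (not in the template above, just a sketch): expose the elastic IP as a stack output, so the ssh address can be read with `aws cloudformation describe-stacks` instead of hunting through the console.

```yaml
Outputs:
  InstancePublicIp:
    Description: Elastic IP to ssh into
    Value: !Ref MyEIP  # Ref on an AWS::EC2::EIP returns the allocated IP address
```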
@@ -0,0 +1,40 @@

from abc import ABC, abstractmethod

import torch
from torchvision import transforms


class DataPipeReadyBenchmark(ABC):
    @abstractmethod
    def prepare_pipe(self, params):
        raise NotImplementedError


class GTSRBReadyBenchmark(DataPipeReadyBenchmark):
    @staticmethod
    def transform(img):
        t = transforms.Compose(
            [
                transforms.ToPILImage(),
                transforms.Resize(size=(100, 100)),
                transforms.ToTensor(),
            ]
        )
        return t(img)

    @staticmethod
    def str_to_list(s):
        return [int(char) for char in s]

    def prepare_pipe(self, params):
        batch_size, device, dp = params

        # Filter out bounding box and path to image
        dp = dp.map(lambda sample: {"image": sample["image"], "label": sample["label"]})

        # Apply image preprocessing
        dp = dp.map(lambda sample: self.transform(sample.decode().to(torch.device(device))), input_col="image")
        dp = dp.map(
            lambda sample: torch.tensor(self.str_to_list(sample.to_categories())).to(torch.device(device)),
            input_col="label",
        )

        # Batch
        dp = dp.batch(batch_size)
        return dp


class HuggingFaceReadyBenchmark(DataPipeReadyBenchmark):
    def prepare_pipe(self, params):
        raise NotImplementedError
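A hedged usage sketch, assuming `dp` is a torchdata datapipe yielding GTSRB samples with `image` and `label` fields, as the lambdas above expect:

```python
benchmark = GTSRBReadyBenchmark()
prepared = benchmark.prepare_pipe((32, "cuda:0", dp))  # (batch_size, device, datapipe)
for batch in prepared:
    ...  # each element is a batch of 32 preprocessed (image, label) pairs
```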
@@ -0,0 +1,52 @@

import csv
from abc import ABC, abstractmethod
from dataclasses import dataclass, fields
from typing import Dict, List

import numpy as np

Duration = int


@dataclass
class MetricCache:
    epoch_durations: List[Duration]
    batch_durations: List[Duration]
    total_duration: int = 0


class MetricExporter(ABC):

> Following up on our discussion from Monday, I took a look at this and I think it should be reasonably simple to plug it into the other

    @abstractmethod
    def export(self, metric_cache: MetricCache) -> None:
        raise NotImplementedError

    def calculate_percentiles(self, metric_cache: MetricCache) -> Dict[str, List[float]]:
        output = {}
        for field in fields(metric_cache):
            duration_list = getattr(metric_cache, field.name)
            if not isinstance(duration_list, list):  # skip scalars such as total_duration
                continue
            output[field.name] = [
                np.percentile(duration_list, 50),
                np.percentile(duration_list, 90),
                np.percentile(duration_list, 99),
            ]
        return output


class StdOutReport(MetricExporter):
    def export(self, metric_cache: MetricCache) -> None:
        percentiles_dict = self.calculate_percentiles(metric_cache)
        for field_name, percentiles in percentiles_dict.items():
            print(f"{field_name} duration is {percentiles}")


class CSVReport(MetricExporter):
    def export(self, metric_cache: MetricCache, filepath: str = "report.csv") -> None:
        percentiles_dict = self.calculate_percentiles(metric_cache)
        with open(filepath, "w", newline="") as file:
            writer = csv.writer(file)
            for field_name, percentiles in percentiles_dict.items():
                writer.writerow([field_name] + percentiles)
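A short end-to-end sketch of the exporters above (the duration values are made up):

```python
cache = MetricCache(epoch_durations=[10, 12, 11], batch_durations=[1, 2, 1, 3])
StdOutReport().export(cache)             # prints p50/p90/p99 for each duration list
CSVReport().export(cache, "report.csv")  # writes one row per duration field
```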
@@ -0,0 +1,9 @@

--extra-index-url https://download.pytorch.org/whl/nightly/cu113
simple-parsing
dill
numpy
torch
torchvision
torchaudio
torchtext
transformers
@NivekT what do you think about this kind of reporting instead? There are probably still a few bugs, but I just want to make sure I'm not overengineering things.