2 changes: 1 addition & 1 deletion centml/compiler/config.py
@@ -38,7 +38,7 @@ class Config(BaseSettings):

    CENTML_MODE: OperationMode = OperationMode.REMOTE_COMPILATION
    CENTML_PREDICTION_DATA_FILE: str = 'tests/sample_data.csv'
-    CENTML_PREDICTION_GPUS: str = "A10G,A100SXM440GB"
+    CENTML_PREDICTION_GPUS: str = "A10G,A100SXM440GB,L4,H10080GBHBM3"
    CENTML_PROMETHEUS_PORT: int = 8000

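The new value extends the prediction targets with L4 and H100 80GB HBM3 GPUs. Downstream code presumably splits the comma-separated string into individual GPU names; a minimal sketch, assuming a plain comma split (the consumer is not shown in this diff):

gpus = "A10G,A100SXM440GB,L4,H10080GBHBM3".split(",")
# -> ['A10G', 'A100SXM440GB', 'L4', 'H10080GBHBM3']
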
58 changes: 47 additions & 11 deletions centml/compiler/prediction/profiler.py
@@ -4,6 +4,8 @@
import torch.fx
from torch.fx.node import Node

+from scripts.timer import timed
+

class Profiler:
    def __init__(self, mod, gpu, treeDB, data_collection_mode=False):
@@ -13,11 +15,30 @@ def __init__(self, mod, gpu, treeDB, data_collection_mode=False):
        self.tree_db = treeDB
        self.gpu = gpu
        self.data_collection_mode = data_collection_mode
+        self.trace_event_idx = 0

    def propagate(self, *args):
        args_iter = iter(args)
        env: Dict[str, Node] = {}
-        total_time = 0
+        total_gpu_time = 0
+        actual_time = 0
+        trace_events = []
+        if self.data_collection_mode:
+            # Warmup before profiling
+            for _ in range(10):
+                _, t = timed(lambda: self.mod(*args))
+
+            # actual_time is to compare prediction to execution time of GraphModule
+            actual_time = t
+
+            with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
+                self.mod(*args)
+            for event in prof.events():
+                # Ignore CPU events for now
+                if event.trace_name is None or event.device_type == torch.autograd.DeviceType.CPU:
+                    continue
+                # Create a mapping of kernel execution times to the corresponding trace events
+                trace_events.append(event.time_range.elapsed_us())

        def load_arg(a):
            return torch.fx.graph.map_arg(a, lambda n: env[n.name])
@@ -81,14 +102,26 @@ def find_dtypes(results):
        def get_time_or_profile(key, inp_shapes, operation, *args, **kwargs):
            t = self.tree_db.get(key, inp_shapes)

-            if self.data_collection_mode and t is None:
+            if self.data_collection_mode:
                with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
                    operation(*args, **kwargs)
-                event_time_total = 0
-                for event in prof.key_averages():
-                    event_time_total += event.cuda_time_total
-                t = event_time_total
-                self.tree_db.add(key, inp_shapes, t)
+
+                if t is None:
+                    # New key
+                    event_time_total = 0
+                    for event in prof.events():
+                        if event.trace_name is None or event.device_type == torch.autograd.DeviceType.CPU:
+                            continue
+                        event_time_total += trace_events[self.trace_event_idx]
+                        self.trace_event_idx += 1
+                    t = event_time_total
+                    self.tree_db.add(key, inp_shapes, t)
+                else:
+                    # Existing key, increment trace_event_idx by # of events to maintain mapping to trace_events list
+                    for event in prof.events():
+                        if event.trace_name is None or event.device_type == torch.autograd.DeviceType.CPU:
+                            continue
+                        self.trace_event_idx += 1

            return t

@@ -110,7 +143,7 @@ def get_time_or_profile(key, inp_shapes, operation, *args, **kwargs):

                t = get_time_or_profile(key, inp_shapes, node.target, *args, **kwargs)

-                total_time += t
+                total_gpu_time += t
            elif node.op == 'call_method':
                self_obj, *args = load_arg(node.args)
                kwargs = load_arg(node.kwargs)
@@ -123,7 +156,7 @@ def get_time_or_profile(key, inp_shapes, operation, *args, **kwargs):

                t = get_time_or_profile(key, inp_shapes, getattr(self_obj, node.target), *args, **kwargs)

-                total_time += t
+                total_gpu_time += t
            elif node.op == 'call_module':
                mod = self.modules[node.target]
                args = load_arg(node.args)
@@ -145,9 +178,12 @@ def get_time_or_profile(key, inp_shapes, operation, *args, **kwargs):

                t = get_time_or_profile(key, inp_shapes, mod, *args, **kwargs)

-                total_time += t
+                total_gpu_time += t
            elif node.op == 'output':
                args = load_arg(node.args)
-                return args[0], total_time
+                if self.data_collection_mode:
+                    # Return full graph execution time as well for accuracy comparison
+                    return args[0], total_gpu_time, actual_time
+                return args[0], total_gpu_time

            env[node.name] = result
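
In data-collection mode, propagate now makes two passes: one whole-graph profile fills trace_events with a duration per CUDA kernel, and get_time_or_profile then replays each op, consuming those timings in launch order through the shared trace_event_idx cursor; the cursor advances even on cache hits so later ops stay aligned with the trace_events list. A minimal standalone sketch of that indexing scheme (hypothetical numbers, not the torch.profiler API):

# Whole-graph pass recorded one elapsed time (us) per CUDA kernel, in order.
trace_events = [12.0, 7.5, 30.2, 4.1]
trace_event_idx = 0

def consume_op_time(num_kernels, cached_time=None):
    # Advance the shared cursor even on a cache hit so later ops stay aligned.
    global trace_event_idx
    window = trace_events[trace_event_idx:trace_event_idx + num_kernels]
    trace_event_idx += num_kernels
    return cached_time if cached_time is not None else sum(window)

assert consume_op_time(2) == 19.5                    # new key: sum its kernels
assert consume_op_time(1, cached_time=33.0) == 33.0  # cached key: skip but advance
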
186 changes: 132 additions & 54 deletions scripts/data_collection.py
@@ -1,30 +1,20 @@
import argparse
import csv
import gc
-import json
-import os
-import random
-import statistics
-import time

-import numpy as np
import torch
-import torchvision.models as models
-from sklearn.neighbors import KDTree
-from torch.profiler import ProfilerActivity, profile, record_function
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
-    BertConfig,
-    BertForMaskedLM,
-    GPT2ForSequenceClassification,
-    PegasusConfig,
-    PegasusForCausalLM,
+    AutoModelForImageClassification,
+    AutoModelForObjectDetection
)


from centml.compiler.prediction.kdtree import KDTreeWithValues
from centml.compiler.prediction.profiler import Profiler
+from scripts.timer import timed

torch.set_float32_matmul_precision('high')
torch.set_default_device('cuda')
@@ -34,32 +24,74 @@
OUTPUT_FILE = 'data.csv'

# Different HuggingFace Models + Different Input Sizes
-hf_model_tests = [
-    ("EleutherAI/gpt-neo-2.7B", (1, 512)),
+llm_tests = [
+    ("google/gemma-7b", (1, 128)),
+    ("microsoft/phi-2", (1,512)),
+    ("microsoft/phi-2", (2,512)),
+    ("facebook/bart-large", (1, 1024)),
+    ("facebook/bart-large", (2, 512)),
    ("gpt2-xl", (1, 1024)),
-    ("gpt2-large", (1, 1024)),
+    ("gpt2-xl", (1, 720)),
+    ("gpt2-xl", (1, 512)),
+    ("gpt2-xl", (2, 512)),
+    ("gpt2-xl", (4, 256)),
+    ("EleutherAI/gpt-neo-2.7B", (1, 512)),
+    ("EleutherAI/gpt-neo-2.7B", (1, 256)),
+    ("gpt2-large", (1, 1024)),
+    ("gpt2-large", (1, 720)),
+    ("gpt2-large", (1, 512)),
+    ("google-bert/bert-large-uncased", (8, 512)),
+    ("google-bert/bert-large-uncased", (16, 512)),
+    ("meta-llama/Meta-Llama-3.1-8B", (1, 512)),
+    ("meta-llama/Meta-Llama-3.1-8B", (1, 256)),
    ("gpt2-medium", (1, 1024)),
-    ("facebook/bart-large", (1, 1024)),
+    ("gpt2-medium", (1, 512)),
+    ("gpt2-medium", (2, 512)),
    ("google/pegasus-cnn_dailymail", (1, 1024)),
+    ("google/pegasus-cnn_dailymail", (1, 512)),
+    ("google/pegasus-cnn_dailymail", (2, 512)),
]

-# Different Batch Sizes for each ResNet Model (torchvision)
-resnet_tests = [1024, 720, 1440]

+# Tests for larger GPUs (A100, H100, etc.)
+# large_llm_tests = [
+#     ("google/gemma-7b", (1, 256)),
+#     ("google/gemma-7b", (1, 512)),
+#     ("google/gemma-7b", (1, 1024)),
+#     ("microsoft/phi-2", (1,1024)),
+#     ("microsoft/phi-2", (1,2048)),
+#     ("microsoft/phi-2", (2,1024)),
+#     ("EleutherAI/gpt-neo-2.7B", (1, 1024)),
+#     ("gpt2-xl", (2, 1024)),
+#     ("gpt2-xl", (4, 512)),
+#     ("meta-llama/Meta-Llama-3.1-8B", (1, 1024)),
+#     ("meta-llama/Meta-Llama-3.1-8B", (1, 512)),
+#     ("google/pegasus-cnn_dailymail", (4, 1024)),
+#     ("facebook/bart-large", (4, 1024)),
+#     ("facebook/bart-large", (2, 1024)),
+#     ("google-bert/bert-large-uncased", (16, 512)),
+#     ("gpt2-medium", (2, 1024)),
+#     ("gpt2-medium", (4, 512)),
+#     ("gpt2-large", (2, 1024)),
+#     ("gpt2-large", (4, 512)),
+# ]

+# Different Batch Sizes for each image classification model
+image_classification_tests = [
+    ("google/efficientnet-b0", 512),
+    ("google/efficientnet-b0", 256),
+    ("google/efficientnet-b0", 128),
+    ("google/vit-base-patch16-224", 128),
+    ("microsoft/resnet-50", 256),
+    ("microsoft/resnet-50", 512),
+]

-def timed(fn):
-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
-    start.record()
-    result = fn()
-    end.record()
-    torch.cuda.synchronize()
-    return result, start.elapsed_time(end) / 1000
+# Different Batch Sizes for each object detection model
+object_detection_tests = [
+    ("hustvl/yolos-tiny", 128),
+    ("hustvl/yolos-tiny", 256),
+    ("hustvl/yolos-tiny", 512),
+    ("facebook/detr-resnet-50", 128),
+    ("facebook/detr-resnet-50", 256),
+]


def percent_error(observed, true):
Expand Down Expand Up @@ -90,24 +122,28 @@ def get(self, key, inp):


db = DataCollectionTreeDB()
added_time = 0
cuda_kernel_time = 0
actual_time = 0


def custom_backend(gm: torch.fx.GraphModule, inps):
print("Compiling")
profiler = Profiler(mod=gm, gpu=CURR_GPU, treeDB=db, data_collection_mode=True)

def forward(*args):
global added_time
out, t = profiler.propagate(*args)
added_time += t
global cuda_kernel_time
global actual_time
out, t, actual_t = profiler.propagate(*args)
cuda_kernel_time += t
actual_time += actual_t
return out

return forward


def hf_model_test(model_name, input_size, custom_backend):
global added_time
def llm_test(model_name, input_size, custom_backend):
global cuda_kernel_time
global actual_time
models_without_tokenizer = {"google/pegasus-cnn_dailymail"}

model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda:0")
@@ -131,22 +167,55 @@ def hf_model_test(model_name, input_size, custom_backend):
    compiled_model = torch.compile(model, backend=custom_backend)
    compiled_model(inp)

-    added_time /= 1000000
+    cuda_kernel_time /= 1000000

    print(f"{model_name}, {input_size}")
-    print("Real time: ", t)
-    print("TOTAL TIME: ", added_time)
-    print("Error: ", percent_error(added_time, t))
+    print("Real time: ", actual_time)
+    print("Kernel execution time: ", cuda_kernel_time)
+    print("Error: ", percent_error(cuda_kernel_time, actual_time))

-    added_time = 0
+    cuda_kernel_time = 0
+    actual_time = 0
    del model, inp, compiled_model
    gc.collect()
    torch.cuda.empty_cache()


-def resnet_test(batch_size, custom_backend):
-    global added_time
-    model = models.resnet50(weights=True, num_classes=1000).cuda()
+def image_classification_test(model_name, batch_size, custom_backend):
+    global cuda_kernel_time
+    global actual_time
+    model = AutoModelForImageClassification.from_pretrained(model_name).to("cuda:0")
+    model.eval()
+    if model_name == "google/vit-base-patch16-224":
+        inp = torch.randn(batch_size, 3, 224, 224).cuda(0)
+    else:
+        inp = torch.randn(batch_size, 3, 128, 128).cuda(0)
+
+    with torch.inference_mode():
+        for _ in range(10):
+            _, t = timed(lambda: model(inp))
+            print(t)
+
+    compiled_model = torch.compile(model, backend=custom_backend)
+    compiled_model(inp)
+
+    cuda_kernel_time /= 1000000
+
+    print(f"{model_name}, {batch_size}")
+    print("Real time: ", actual_time)
+    print("TOTAL TIME: ", cuda_kernel_time)
+    print("Error: ", percent_error(cuda_kernel_time, actual_time))
+
+    cuda_kernel_time = 0
+    actual_time = 0
+    del model, inp, compiled_model
+    gc.collect()
+    torch.cuda.empty_cache()
+
+def object_detection_test(model_name, batch_size, custom_backend):
+    global cuda_kernel_time
+    global actual_time
+    model = AutoModelForObjectDetection.from_pretrained(model_name).to("cuda:0")
+    model.eval()
    inp = torch.randn(batch_size, 3, 128, 128).cuda(0)

@@ -157,22 +226,31 @@ def resnet_test(batch_size, custom_backend):

    compiled_model = torch.compile(model, backend=custom_backend)
    compiled_model(inp)
-    print(f"resnet, ({batch_size}, 3, 128, 128)")
-    print("Real time: ", t)
-    print("TOTAL TIME: ", added_time)
-    print("Error: ", percent_error(added_time, t))

-    added_time = 0
+    cuda_kernel_time /= 1000000
+
+    print(f"{model_name}, {batch_size}")
+    print("Real time: ", actual_time)
+    print("TOTAL TIME: ", cuda_kernel_time)
+    print("Error: ", percent_error(cuda_kernel_time, actual_time))
+
+    cuda_kernel_time = 0
+    actual_time = 0
    del model, inp, compiled_model
    gc.collect()
    torch.cuda.empty_cache()

+# for model_name, input_size in large_llm_tests:
+#     llm_test(model_name, input_size, custom_backend)
+
+for model_name, input_size in llm_tests:
+    llm_test(model_name, input_size, custom_backend)
+
-for model_name, input_size in hf_model_tests:
-    hf_model_test(model_name, input_size, custom_backend)
+for model_name, batch_size in object_detection_tests:
+    object_detection_test(model_name, batch_size, custom_backend)

-for batch_size in resnet_tests:
-    resnet_test(batch_size, custom_backend)
+for model_name, batch_size in image_classification_tests:
+    image_classification_test(model_name, batch_size, custom_backend)

# Write to CSV
with open(OUTPUT_FILE, 'w', newline='') as csvfile:
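
The CSV-writing block is truncated in this view. For orientation only, a hypothetical sketch of what dumping the collected measurements could look like; the column names and DB attributes here are assumptions, not code from this PR:

import csv

# Hypothetical dump of the collected (key, input shapes, kernel time) tuples;
# the real column layout of data.csv and the DB attribute names (trees,
# points, values) are assumptions, not taken from this diff.
def dump_db(db, path):
    with open(path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["key", "inp_shapes", "gpu_time_us"])
        for key, tree in db.trees.items():
            for point, value in zip(tree.points, tree.values):
                writer.writerow([key, point, value])
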
10 changes: 10 additions & 0 deletions scripts/timer.py
@@ -0,0 +1,10 @@
+import torch
+
+def timed(fn):
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    result = fn()
+    end.record()
+    torch.cuda.synchronize()
+    return result, start.elapsed_time(end) / 1000
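
For reference, torch.cuda.Event.elapsed_time reports milliseconds, so timed returns (result, seconds). A small usage sketch mirroring the warmup pattern in data_collection.py; the Linear model and input here are placeholders:

import torch
from scripts.timer import timed

model = torch.nn.Linear(1024, 1024).cuda()   # stand-in for any CUDA-resident module
inp = torch.randn(64, 1024, device="cuda")

for _ in range(10):                          # warmup, as in data_collection.py
    _, t = timed(lambda: model(inp))
print(f"last forward: {t:.6f} s")            # elapsed_time() is ms; timed divides by 1000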