+import os
 import time
+import math

 import torch
 import intel_extension_for_pytorch as ipex
-from transformers import AutoModelForCausalLM, AutoTokenizer
+import numpy as np
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    LlamaForCausalLM,
+    LlamaTokenizer,
+)

-from dl_bench.utils import TimerManager, Benchmark, str_to_dtype
+from dl_bench.utils import Benchmark, get_report, get_time, str_to_dtype


 def get_llm(name, dtype):
-    if name != "gptj":
+    if name == "gptj":
+        model_name = "EleutherAI/gpt-j-6B"
+
+        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
+        tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+    elif name == "llama2-13b":
+        kwargs = {}
+        if "HF_TOKEN" in os.environ:
+            kwargs["token"] = os.environ.get("HF_TOKEN")
+
+        model_name = "meta-llama/Llama-2-13b-hf"
+        model = LlamaForCausalLM.from_pretrained(
+            model_name, torch_dtype=dtype, **kwargs
+        )
+        tokenizer = LlamaTokenizer.from_pretrained(model_name, **kwargs)
+    else:
         raise ValueError("Unsupported model name")
-
-    model_name = "EleutherAI/gpt-j-6B"
-
-    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
-    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
     return tokenizer, model


 class LlmBenchmark(Benchmark):
     def __init__(self, params) -> None:
         name = params.get("name", "gptj")
         dtype = params.get("dtype")
+        self.batch_size = params.get("batch_size", 1)
+        self.n_iter = params.get("n_iter", 5)
+        self.warmup_batches = params.get("warmup", 2)
+
         self.tokenizer, self.model = get_llm(name, dtype=str_to_dtype(dtype))
-        self.warmup_prompt = "There are several ways to travel, but my favourite is"
-        self.prompt = "Here is a story about a person that find out he was adopted: one day little Timmy was looking through old"
+        prompt = "Here is a story about a person that find out he was adopted: one day little Timmy was looking through old"
+        self.prompt = [prompt] * self.batch_size
         self.gen_kwargs = {
             "early_stopping": True,
             "max_new_tokens": 128,
             "min_new_tokens": 30,
             "num_beams": 4,
         }

-    def generate(self, prompt, backend):
-        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+    def generate(self, backend):
         backend.sync()
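+        # The timed region below covers tokenization, device transfer,
+        # beam-search generation, and decoding of one sequence.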
         start = time.perf_counter()
-        input_ids = backend.to_device(input_ids)
+        input_tokens = self.tokenizer(self.prompt, return_tensors="pt").input_ids
+        input_tokens = backend.to_device(input_tokens)
         gen_tokens = self.model.generate(
-            input_ids, **self.gen_kwargs, pad_token_id=self.tokenizer.eos_token_id
+            input_tokens, **self.gen_kwargs, pad_token_id=self.tokenizer.eos_token_id
         )
         backend.sync()
+        text = self.tokenizer.batch_decode(gen_tokens)[0]
         total_time = time.perf_counter() - start

-        # text = self.tokenizer.batch_decode(gen_tokens)[0]
-        return gen_tokens[0], total_time
+        # generate returns the prompt tokens followed by the new tokens; keep only the new ones
+        output_tokens = gen_tokens[:, input_tokens.shape[1] :]
+        return output_tokens, total_time

     def inference(self, backend):
-        tm = TimerManager()
-
-        # Recover MACs computation
+        # TODO: Recover MACs computation
         # generate requires several forward passes, so an additional algorithm is needed to estimate MACs
         # self.flops_per_sample = get_macs(self.model, self.in_shape, backend) * 2
-
         self.model = backend.prepare_eval_transformer(self.model)

-        print("Warmup started")
-        with torch.inference_mode(), tm.timeit("warmup_s"):
-            self.model.eval()
-            self.generate(self.warmup_prompt, backend)
-        print("Warmup done")
-
         self.model.eval()
         enabled = backend.dtype != torch.float32
-        with torch.inference_mode(), torch.autocast(
-            enabled=enabled, device_type=backend.device_name
-        ), tm.timeit("duration_s"):
-            tokens, total_time = self.generate(self.prompt, backend)
-            outputs = [tokens]

-        results = tm.get_results()
-        results["samples_per_s"] = len(tokens) / total_time
-        results["flops_per_sample"] = 1
+        n_items = 0
+        outputs = []
+        fw_times = []

-        return results, outputs
+        self.model.eval()
+        # Start the timer before the loop so duration_s is defined even when warmup is 0.
+        start = get_time()
+        for i in range(self.n_iter):
+            print(f"Epoch {i+1}/{self.n_iter}")
+            cast = torch.autocast(enabled=enabled, device_type=backend.device_name)
+            with torch.inference_mode(), cast:
+                tokens, total_time = self.generate(backend)
+
+            if i < self.warmup_batches:
+                # Restart the timer because that was just a warmup iteration
+                start = get_time()
+                continue
+
+            print(f"Fw time: {total_time:.1f}")
+            fw_times.append(total_time)
+            n_items += math.prod(tokens.shape)
+            outputs.append(tokens)
+
+        stop = get_time()
+
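+        # duration_s excludes the warmup iterations (the timer restarts after each one).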
+        report = get_report(
+            fw_times=fw_times,
+            duration_s=stop - start,
+            n_items=n_items,
+            flops_per_sample=1,
+        )
+        return report, outputs
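
For reference, below is a minimal sketch of how the new class could be driven outside the dl_bench harness. The ToyBackend class is hypothetical: it only stubs out the members the benchmark code above actually touches (sync, to_device, prepare_eval_transformer, dtype, device_name), and the "float32" value assumes str_to_dtype accepts that spelling. Running it with name="gptj" downloads the GPT-J 6B checkpoint on first use.

import torch


class ToyBackend:
    """Hypothetical stand-in for the dl_bench backend."""

    device_name = "cpu"
    dtype = torch.float32

    def sync(self):
        # A GPU backend would synchronize the device here; nothing to do on CPU.
        pass

    def to_device(self, tensor):
        # A GPU backend would move the tensor to the accelerator.
        return tensor

    def prepare_eval_transformer(self, model):
        # A real backend might apply ipex.optimize or torch.compile here.
        return model


params = {"name": "gptj", "dtype": "float32", "batch_size": 1, "n_iter": 5, "warmup": 2}
benchmark = LlmBenchmark(params)
report, outputs = benchmark.inference(ToyBackend())
print(report)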