From 7a4209a0fc77cb6be77d6884f647faaaed47fb52 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Tue, 16 Apr 2024 22:55:22 -0700
Subject: [PATCH 01/12] prompt

---
 generate.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/generate.py b/generate.py
index 4d52b4c8b..8b78bbd3a 100644
--- a/generate.py
+++ b/generate.py
@@ -316,6 +316,7 @@ def _main(
     builder_args: BuilderArgs,
     speculative_builder_args: BuilderArgs,
     tokenizer_args: TokenizerArgs,
+    generator_args: GeneratorArgs,
     prompt: str = "Hello, my name is",
     chat_mode: bool = False,
     num_samples: int = 5,
@@ -365,7 +366,9 @@ def _main(
     else:
         draft_model = None

-    encoded = encode_tokens(tokenizer, prompt, bos=True, device=builder_args.device)
+    encoded = encode_tokens(
+        tokenizer, generator_args.prompt, bos=True, device=builder_args.device
+    )
     print(encoded)
     prompt_length = encoded.size(0)

From a4e09667202cf0b7e43f1485114ff0a77f433986 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Tue, 16 Apr 2024 23:03:32 -0700
Subject: [PATCH 02/12] chat_mode, num_samples

---
 generate.py | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/generate.py b/generate.py
index 8b78bbd3a..4c1fb679a 100644
--- a/generate.py
+++ b/generate.py
@@ -31,22 +31,22 @@
 @dataclass
 class GeneratorArgs:
     prompt: str = "torchchat is pronounced torch-chat and is so cool because"
-    chat: bool = (False,)
-    gui: bool = (False,)
-    num_samples: int = (1,)
-    max_new_tokens: int = (200,)
-    top_k: int = (200,)
-    temperature: int = (0,) # deterministic argmax
-    compile: bool = (False,)
-    compile_prefill: bool = (False,)
-    speculate_k: int = (5,)
+    chat_mode: bool = False
+    gui_mode: bool = False
+    num_samples: int = 1
+    max_new_tokens: int = 200
+    top_k: int = 200
+    temperature: int = 0 # deterministic argmax
+    compile: bool = False
+    compile_prefill: bool = False
+    speculate_k: int = 5

     @classmethod
     def from_args(cls, args): # -> GeneratorArgs:
         return cls(
             prompt=args.prompt,
-            chat=args.chat,
-            gui=args.gui,
+            chat_mode=args.chat,
+            gui_mode=args.gui,
             num_samples=args.num_samples,
             max_new_tokens=args.max_new_tokens,
             top_k=args.top_k,
@@ -317,9 +317,6 @@ def _main(
     speculative_builder_args: BuilderArgs,
     tokenizer_args: TokenizerArgs,
     generator_args: GeneratorArgs,
-    prompt: str = "Hello, my name is",
-    chat_mode: bool = False,
-    num_samples: int = 5,
     max_new_tokens: int = 100,
     top_k: int = 200,
     temperature: float = 0.8,
@@ -407,9 +404,9 @@ def _main(
     }

     start = -1 if compile else 0
-    for i in range(start, num_samples):
+    for i in range(start, generator_args.num_samples):
         device_sync(device=builder_args.device)
-        if i >= 0 and chat_mode:
+        if i >= 0 and generator_args.chat_mode:
             prompt = input("What is your prompt? ")
             if is_chat:
                 prompt = f"{B_INST} {prompt.strip()} {E_INST}"
@@ -417,7 +414,7 @@ def _main(
             encoded = encode_tokens(
                 tokenizer, prompt, bos=True, device=builder_args.device
             )

-        if chat_mode and i >= 0:
+        if generator_args.chat_mode and i >= 0:
             buffer = []
             period_id = tokenizer.encode(".")[0]
             done_generating = False
@@ -439,7 +436,7 @@ def callback(x):
         t0 = time.perf_counter()
         import contextlib

-        if (i != num_samples - 1 or not profile) or (use_tp and rank != 0):
+        if (i != generator_args.num_samples - 1 or not profile) or (use_tp and rank != 0):
             prof = contextlib.nullcontext()
         else:
             torch.profiler._utils._init_for_cuda_graphs()
@@ -451,7 +448,7 @@ def callback(x):
                 max_new_tokens,
                 draft_model=draft_model,
                 speculate_k=speculate_k,
-                chat_mode=chat_mode,
+                chat_mode=generator_args.chat_mode,
                 callback=callback,
                 temperature=temperature,
                 top_k=top_k,
@@ -468,7 +465,7 @@ def callback(x):
         device_sync(device=builder_args.device)
         t = time.perf_counter() - t0

-        if not chat_mode:
+        if not generator_args.chat_mode:
             print(tokenizer.decode(y.tolist()))
         else:
             print()
@@ -498,13 +495,13 @@ def main(args):
     builder_args = BuilderArgs.from_args(args)
     speculative_builder_args = BuilderArgs.from_speculative_args(args)
     tokenizer_args = TokenizerArgs.from_args(args)
+    generator_args = GeneratorArgs.from_args(args)
+
     _main(
         builder_args,
         speculative_builder_args,
         tokenizer_args,
-        args.prompt,
-        args.chat,
-        args.num_samples,
+        generator_args,
         args.max_new_tokens,
         args.top_k,
         args.temperature,

From 7af1832667d0cf9a9175ee2499dd5f4f4e788c38 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Tue, 16 Apr 2024 23:22:35 -0700
Subject: [PATCH 03/12] move more args

---
 generate.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/generate.py b/generate.py
index 4c1fb679a..86fb596ae 100644
--- a/generate.py
+++ b/generate.py
@@ -317,9 +317,6 @@ def _main(
     speculative_builder_args: BuilderArgs,
     tokenizer_args: TokenizerArgs,
     generator_args: GeneratorArgs,
-    max_new_tokens: int = 100,
-    top_k: int = 200,
-    temperature: float = 0.8,
     compile: bool = True,
     compile_prefill: bool = False,
     profile: Optional[Path] = None,
@@ -445,13 +442,13 @@ def callback(x):
             y, metrics = generate(
                 model,
                 encoded,
-                max_new_tokens,
+                generator_args.max_new_tokens,
                 draft_model=draft_model,
                 speculate_k=speculate_k,
                 chat_mode=generator_args.chat_mode,
                 callback=callback,
-                temperature=temperature,
-                top_k=top_k,
+                temperature=generator_args.temperature,
+                top_k=generator_args.top_k,
             )
             aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
             if i == -1:
@@ -502,9 +499,6 @@ def main(args):
         speculative_builder_args,
         tokenizer_args,
         generator_args,
-        args.max_new_tokens,
-        args.top_k,
-        args.temperature,
         args.compile,
         args.compile_prefill,
         args.profile,

From ad0fbec8ab8bae8ae53b2a3e16054cd5b83adb44 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Tue, 16 Apr 2024 23:33:52 -0700
Subject: [PATCH 04/12] more gen args

---
 generate.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/generate.py b/generate.py
index 86fb596ae..bc56ac365 100644
--- a/generate.py
+++ b/generate.py
@@ -31,6 +31,7 @@
 @dataclass
 class GeneratorArgs:
     prompt: str = "torchchat is pronounced torch-chat and is so cool because"
+    encoded_prompt: Optional[torch.Tensor] = None
     chat_mode: bool = False
     gui_mode: bool = False
     num_samples: int = 1
@@ -45,6 +46,7 @@ class GeneratorArgs:
     def from_args(cls, args): # -> GeneratorArgs:
         return cls(
             prompt=args.prompt,
+            encoded_prompt=None,
             chat_mode=args.chat,
             gui_mode=args.gui,
             num_samples=args.num_samples,
@@ -229,8 +231,7 @@ def speculative_decode(
 @torch.no_grad()
 def generate(
     model: Transformer,
-    prompt: torch.Tensor,
-    max_new_tokens: int,
+    generator_args: Generator_Args,
     *,
     chat_mode: bool,
     draft_model: Transformer,
@@ -241,11 +242,11 @@ def generate(
     """
     Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
     """
-
+    prompt = generator_args.encoded_prompt
     is_speculative = draft_model is not None
     # create an empty tensor of the expected final shape and fill in the current tokens
     T = prompt.size(0)
-    T_new = T + max_new_tokens
+    T_new = T + generator_args.max_new_tokens
     if chat_mode:
         max_seq_length = 350
     else:
         max_seq_length = min(T_new, model.config.block_size)
@@ -295,7 +296,7 @@ def generate(
             model,
             next_token.view(1, -1),
             input_pos,
-            max_new_tokens - 1,
+            generator_args.max_new_tokens - 1,
             callback=callback,
             **sampling_kwargs,
         )
@@ -305,7 +306,7 @@ def generate(
     return seq, generate_stats

-def encode_tokens(tokenizer, string, bos=True, device="cuda"):
+def encode_tokens(tokenizer, string, bos=True, device):
     tokens = tokenizer.encode(string)
     if bos:
         tokens = [tokenizer.bos_id()] + tokens
@@ -320,7 +321,6 @@ def _main(
     compile: bool = True,
     compile_prefill: bool = False,
     profile: Optional[Path] = None,
-    speculate_k: int = 5,
     quantize=None,
 ) -> None:
     """Generates text samples based on a pre-trained Transformer model and tokenizer."""
@@ -441,10 +441,8 @@ def callback(x):
         with prof:
             y, metrics = generate(
                 model,
-                encoded,
-                generator_args.max_new_tokens,
                 draft_model=draft_model,
-                speculate_k=speculate_k,
+                speculate_k=generator_args.speculate_k,
                 chat_mode=generator_args.chat_mode,
                 callback=callback,
                 temperature=generator_args.temperature,
                 top_k=generator_args.top_k,

From 3f49018b62b3fe6fdb8c6894bbd3d0e2b4c035e6 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 05:55:00 -0700
Subject: [PATCH 05/12] update

---
 generate.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/generate.py b/generate.py
index bc56ac365..11b62fc7d 100644
--- a/generate.py
+++ b/generate.py
@@ -233,7 +233,6 @@ def generate(
     model: Transformer,
     generator_args: Generator_Args,
     *,
-    chat_mode: bool,
     draft_model: Transformer,
     speculate_k: Optional[int] = 8,
     callback=lambda x: x,
@@ -247,7 +246,7 @@ def generate(
     # create an empty tensor of the expected final shape and fill in the current tokens
     T = prompt.size(0)
     T_new = T + generator_args.max_new_tokens
-    if chat_mode:
+    if generator_args.chat_mode:
         max_seq_length = 350
     else:
         max_seq_length = min(T_new, model.config.block_size)

From b961023da264c5acf7c5f9e308fa3c212e271349 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 05:57:22 -0700
Subject: [PATCH 06/12] args

---
 generate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/generate.py b/generate.py
index 11b62fc7d..bb43b764b 100644
--- a/generate.py
+++ b/generate.py
@@ -305,7 +305,7 @@ def generate(
     return seq, generate_stats

-def encode_tokens(tokenizer, string, bos=True, device):
+def encode_tokens(tokenizer, string, bos=True, device="cpu"):
     tokens = tokenizer.encode(string)
     if bos:
         tokens = [tokenizer.bos_id()] + tokens

From 5162b9e6cab2fe420f22ed8ca3a792d66f964831 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:08:27 -0700
Subject: [PATCH 07/12] undo some changes

---
 generate.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/generate.py b/generate.py
index bb43b764b..19599ad08 100644
--- a/generate.py
+++ b/generate.py
@@ -231,8 +231,10 @@ def speculative_decode(
 @torch.no_grad()
 def generate(
     model: Transformer,
-    generator_args: Generator_Args,
+    prompt: torch.Tensor,
+    max_new_tokens: int,
     *,
+    chat_mode: bool,
     draft_model: Transformer,
     speculate_k: Optional[int] = 8,
     callback=lambda x: x,
@@ -241,12 +243,12 @@ def generate(
     """
     Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
     """
-    prompt = generator_args.encoded_prompt
+
     is_speculative = draft_model is not None
     # create an empty tensor of the expected final shape and fill in the current tokens
     T = prompt.size(0)
-    T_new = T + generator_args.max_new_tokens
+    T_new = T + max_new_tokens
-    if generator_args.chat_mode:
+    if chat_mode:
         max_seq_length = 350
     else:
         max_seq_length = min(T_new, model.config.block_size)
@@ -295,7 +297,7 @@ def generate(
             model,
             next_token.view(1, -1),
             input_pos,
-            generator_args.max_new_tokens - 1,
+            max_new_tokens - 1,
             callback=callback,
             **sampling_kwargs,
         )
@@ -432,6 +434,7 @@ def callback(x):
         t0 = time.perf_counter()
         import contextlib
+        generator_args.encoded_prompt = encoded
         if (i != generator_args.num_samples - 1 or not profile) or (use_tp and rank != 0):
             prof = contextlib.nullcontext()
         else:
             torch.profiler._utils._init_for_cuda_graphs()
@@ -440,12 +443,14 @@ def callback(x):
         with prof:
             y, metrics = generate(
                 model,
+                encoded,
+                max_new_tokens,
                 draft_model=draft_model,
-                speculate_k=generator_args.speculate_k,
+                speculate_k=speculate_k,
                 chat_mode=generator_args.chat_mode,
                 callback=callback,
-                temperature=generator_args.temperature,
-                top_k=generator_args.top_k,
+                temperature=temperature,
+                top_k=top_k,
             )
             aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
             if i == -1:
@@ -499,7 +504,6 @@ def main(args):
         args.compile,
         args.compile_prefill,
         args.profile,
-        args.speculate_k,
         args.quantize,
     )

From 0846920e191b9ca0d05d978f0b4d820fa9a74141 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:10:30 -0700
Subject: [PATCH 08/12] typos

---
 generate.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/generate.py b/generate.py
index 19599ad08..a1ede96e3 100644
--- a/generate.py
+++ b/generate.py
@@ -444,13 +444,13 @@ def callback(x):
             y, metrics = generate(
                 model,
                 encoded,
-                max_new_tokens,
+                generator_args.max_new_tokens,
                 draft_model=draft_model,
-                speculate_k=speculate_k,
+                speculate_k=generator_args.speculate_k,
                 chat_mode=generator_args.chat_mode,
                 callback=callback,
-                temperature=temperature,
-                top_k=top_k,
+                temperature=generator_args.temperature,
+                top_k=generator_args.top_k,
             )
             aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
             if i == -1:

From 95da42123d515d33510e27bbb764454691b07c51 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:24:54 -0700
Subject: [PATCH 09/12] typo

---
 .github/workflows/compile_t4-dtype.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/compile_t4-dtype.yml b/.github/workflows/compile_t4-dtype.yml
index e0e3259c2..2a6980cb5 100644
--- a/.github/workflows/compile_t4-dtype.yml
+++ b/.github/workflows/compile_t4-dtype.yml
@@ -68,7 +68,7 @@ jobs:
           echo "******************************************"
           python generate.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
           cat ./output_eager
-          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{" embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
           cat ./output_compiled
           python export.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
           python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti

From cf9bea7a90d636429c53ba8417e76dba7ec29f1d Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:31:41 -0700
Subject: [PATCH 10/12] fix tests

---
 .../{compile-bf16.yml => compile-dtype.yml}  |  5 +-
 .github/workflows/eager-dtype.yml            | 87 +++++++++++++++++++
 2 files changed, 88 insertions(+), 4 deletions(-)
 rename .github/workflows/{compile-bf16.yml => compile-dtype.yml} (98%)
 create mode 100644 .github/workflows/eager-dtype.yml

diff --git a/.github/workflows/compile-bf16.yml b/.github/workflows/compile-dtype.yml
similarity index 98%
rename from .github/workflows/compile-bf16.yml
rename to .github/workflows/compile-dtype.yml
index c255e3a60..ec99f9e3f 100644
--- a/.github/workflows/compile-bf16.yml
+++ b/.github/workflows/compile-dtype.yml
@@ -11,7 +11,7 @@ jobs:
   run-tinystories:
     strategy:
       matrix:
-        runner: [ubuntu-latest, macos-14, macos-12]
+        runner: [ubuntu-latest, macos-14]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout repo
@@ -102,9 +102,6 @@ jobs:
           echo "******************************************"
           echo "******** INT4 group-wise quantized *******"
           echo "******************************************"
-          if [ ${DTYPE} == float16 ]; then
-            DTYPE=bfloat16
-          fi
           python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
           cat ./output_eager

diff --git a/.github/workflows/eager-dtype.yml b/.github/workflows/eager-dtype.yml
new file mode 100644
index 000000000..f8f564b7b
--- /dev/null
+++ b/.github/workflows/eager-dtype.yml
@@ -0,0 +1,87 @@
+name: Compile-dtype main
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  run-tinystories:
+    strategy:
+      matrix:
+        runner: [macos-12]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.11
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Install requirements
+        run: |
+          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+          pip install -r requirements.txt
+      - name: Download checkpoints
+        run: |
+          mkdir -p checkpoints/stories15M
+          pushd checkpoints/stories15M
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+          popd
+      - name: Run inference
+        run: |
+          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+          export MODEL_NAME=stories15M
+          export MODEL_DIR=/tmp
+          for DTYPE in bfloat16 float16 float32; do
+            # if [ $(uname -s) == Darwin ]; then
+            #   export DTYPE=float16
+            # fi
+            python generate.py --dtype ${DTYPE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "******************************************"
+            echo "******* Emb: channel-wise quantized ******"
+            echo "******************************************"
+            python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "******************************************"
+            echo "******** Emb: group-wise quantized *******"
+            echo "******************************************"
+            python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "******************************************"
+            echo "******* INT8 channel-wise quantized ******"
+            echo "******************************************"
+            python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "******************************************"
+            echo "******** INT8 group-wise quantized *******"
+            echo "******************************************"
+            python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "******************************************"
+            echo "******** INT4 group-wise quantized *******"
+            echo "******************************************"
+
+            python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "tests complete for ${DTYPE}"
+          done
+
+          echo "tests complete for all dtypes!"
\ No newline at end of file

From 4d64713fc777414821de405b91388f25e14a91bd Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:49:53 -0700
Subject: [PATCH 11/12] typo

---
 .github/workflows/compile_t4-dtype.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/compile_t4-dtype.yml b/.github/workflows/compile_t4-dtype.yml
index 2a6980cb5..8c15eb4b1 100644
--- a/.github/workflows/compile_t4-dtype.yml
+++ b/.github/workflows/compile_t4-dtype.yml
@@ -79,7 +79,7 @@ jobs:
           echo "******************************************"
           python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
           cat ./output_eager
-          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{" linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
           cat ./output_compiled
           python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
           python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti

From 3e48b0060dde25f6a5d7ab514173aed288ea7e72 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:53:21 -0700
Subject: [PATCH 12/12] typo

---
 .github/workflows/eager-dtype.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/eager-dtype.yml b/.github/workflows/eager-dtype.yml
index f8f564b7b..d73832dde 100644
--- a/.github/workflows/eager-dtype.yml
+++ b/.github/workflows/eager-dtype.yml
@@ -1,4 +1,4 @@
-name: Compile-dtype main
+name: Eager-dtype main
 
 on:
   push:
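Taken together, the generate.py changes in patches 01-08 replace the loose prompt, chat_mode, num_samples, max_new_tokens, top_k, temperature, and speculate_k parameters of _main() with a single GeneratorArgs object built once from the parsed command-line arguments. The sketch below is illustrative only: the field names and defaults mirror the dataclass as it stands after PATCH 02 and PATCH 04, while the simplified _main() body and the direct construction at the end are assumptions for demonstration, not code from the repository.

from dataclasses import dataclass
from typing import Optional


@dataclass
class GeneratorArgs:
    # Field names and defaults as introduced in PATCH 02 and PATCH 04.
    prompt: str = "torchchat is pronounced torch-chat and is so cool because"
    encoded_prompt: Optional[object] = None  # filled with the tokenized prompt at runtime
    chat_mode: bool = False
    gui_mode: bool = False
    num_samples: int = 1
    max_new_tokens: int = 200
    top_k: int = 200
    temperature: int = 0  # deterministic argmax
    compile: bool = False
    compile_prefill: bool = False
    speculate_k: int = 5


def _main(generator_args: GeneratorArgs) -> None:
    # Stand-in for the real _main(): options are read off the dataclass
    # instead of being passed around as separate keyword arguments.
    for _ in range(generator_args.num_samples):
        print(
            f"would generate {generator_args.max_new_tokens} tokens "
            f"for prompt {generator_args.prompt!r}"
        )


if __name__ == "__main__":
    _main(GeneratorArgs(num_samples=2, max_new_tokens=32))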