
Fix tests #235

Merged: 15 commits, Apr 17, 2024
Changes from 12 commits
@@ -11,7 +11,7 @@ jobs:
  run-tinystories:
    strategy:
      matrix:
-        runner: [ubuntu-latest, macos-14, macos-12]
+        runner: [ubuntu-latest, macos-14]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout repo
@@ -102,9 +102,6 @@ jobs:
echo "******************************************"
echo "******** INT4 group-wise quantized *******"
echo "******************************************"
-if [ ${DTYPE} == float16 ]; then
-  DTYPE=bfloat16
-fi

python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
2 changes: 1 addition & 1 deletion .github/workflows/compile_t4-dtype.yml
@@ -68,7 +68,7 @@ jobs:
echo "******************************************"
python generate.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{" embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
cat ./output_compiled
python export.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
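The single change to compile_t4-dtype.yml above deletes a stray space inside a JSON key (" embedding" becomes "embedding"). A minimal Python sketch of why one space breaks the run, assuming quantization schemes are dispatched by exact key lookup; the handler table and function here are hypothetical illustration, not torchchat's actual code:

import json

# Hypothetical dispatch table; torchchat's real quantization code differs.
QUANT_HANDLERS = {
    "embedding": lambda cfg: f"8-bit embedding quant, groupsize={cfg['groupsize']}",
    "linear:int8": lambda cfg: f"int8 linear quant, groupsize={cfg['groupsize']}",
}

def apply_quant(quant_json: str) -> None:
    for key, cfg in json.loads(quant_json).items():
        handler = QUANT_HANDLERS.get(key)
        if handler is None:
            # " embedding" (leading space) is a different string from
            # "embedding", so an exact lookup finds no handler.
            raise ValueError(f"unknown quantization scheme: {key!r}")
        print(handler(cfg))

apply_quant('{"embedding": {"bitwidth": 8, "groupsize": 8}}')   # matches
# apply_quant('{" embedding": {"bitwidth": 8, "groupsize": 8}}')  # would raise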
87 changes: 87 additions & 0 deletions .github/workflows/eager-dtype.yml
@@ -0,0 +1,87 @@
name: Compile-dtype main
[Contributor review comment on the line above: Rename?]

on:
  push:
    branches:
      - main
  pull_request:
  workflow_dispatch:

jobs:
  run-tinystories:
    strategy:
      matrix:
        runner: [macos-12]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v2
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.11
      - name: Print machine info
        run: |
          uname -a
          if [ $(uname -s) == Darwin ]; then
            sysctl machdep.cpu.brand_string
            sysctl machdep.cpu.core_count
          fi
      - name: Install requirements
        run: |
          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
      - name: Download checkpoints
        run: |
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
      - name: Run inference
        run: |
          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
          export MODEL_NAME=stories15M
          export MODEL_DIR=/tmp
          for DTYPE in bfloat16 float16 float32; do
            # if [ $(uname -s) == Darwin ]; then
            #   export DTYPE=float16
            # fi
            python generate.py --dtype ${DTYPE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******* Emb: channel-wise quantized ******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******** Emb: group-wise quantized *******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******* INT8 channel-wise quantized ******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******** INT8 group-wise quantized *******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******** INT4 group-wise quantized *******"
            echo "******************************************"

            python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "tests complete for ${DTYPE}"
          done

          echo "tests complete for all dtypes!"
21 changes: 8 additions & 13 deletions generate.py
@@ -31,6 +31,7 @@
@dataclass
class GeneratorArgs:
    prompt: str = "torchchat is pronounced torch-chat and is so cool because"
+    encoded_prompt: Optional[torch.Tensor] = None
    chat_mode: bool = False
    gui_mode: bool = False
    num_samples: int = 1
@@ -45,6 +46,7 @@ class GeneratorArgs:
    def from_args(cls, args):  # -> GeneratorArgs:
        return cls(
            prompt=args.prompt,
+            encoded_prompt=None,
            chat_mode=args.chat,
            gui_mode=args.gui,
            num_samples=args.num_samples,
@@ -305,7 +307,7 @@ def generate(
    return seq, generate_stats


-def encode_tokens(tokenizer, string, bos=True, device="cuda"):
+def encode_tokens(tokenizer, string, bos=True, device="cpu"):
    tokens = tokenizer.encode(string)
    if bos:
        tokens = [tokenizer.bos_id()] + tokens
@@ -317,13 +319,9 @@ def _main(
    speculative_builder_args: BuilderArgs,
    tokenizer_args: TokenizerArgs,
    generator_args: GeneratorArgs,
-    max_new_tokens: int = 100,
-    top_k: int = 200,
-    temperature: float = 0.8,
    compile: bool = True,
    compile_prefill: bool = False,
    profile: Optional[Path] = None,
-    speculate_k: int = 5,
    quantize=None,
) -> None:
    """Generates text samples based on a pre-trained Transformer model and tokenizer."""
@@ -436,6 +434,7 @@ def callback(x):
    t0 = time.perf_counter()
    import contextlib

+    generator_args.encoded_prompt = encoded
    if (i != generator_args.num_samples - 1 or not profile) or (use_tp and rank != 0):
        prof = contextlib.nullcontext()
    else:
@@ -445,13 +444,13 @@ def callback(x):
        y, metrics = generate(
            model,
            encoded,
-            max_new_tokens,
+            generator_args.max_new_tokens,
            draft_model=draft_model,
-            speculate_k=speculate_k,
+            speculate_k=generator_args.speculate_k,
            chat_mode=generator_args.chat_mode,
            callback=callback,
-            temperature=temperature,
-            top_k=top_k,
+            temperature=generator_args.temperature,
+            top_k=generator_args.top_k,
        )
        aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
        if i == -1:
@@ -502,13 +501,9 @@ def main(args):
        speculative_builder_args,
        tokenizer_args,
        generator_args,
-        args.max_new_tokens,
-        args.top_k,
-        args.temperature,
        args.compile,
        args.compile_prefill,
        args.profile,
-        args.speculate_k,
        args.quantize,
    )
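Taken together, the generate.py changes consolidate per-run generation options onto GeneratorArgs instead of threading them through _main() as loose parameters. A hedged sketch of the resulting pattern, where field names follow the diff and everything else is simplified illustration:

from dataclasses import dataclass
from typing import Optional

import torch

@dataclass
class GeneratorArgs:
    # Fields mirror the diff; the real dataclass carries more.
    prompt: str = "torchchat is pronounced torch-chat and is so cool because"
    encoded_prompt: Optional[torch.Tensor] = None
    max_new_tokens: int = 100
    top_k: int = 200
    temperature: float = 0.8
    speculate_k: int = 5

def _main(generator_args: GeneratorArgs) -> None:
    # Before this PR: max_new_tokens, top_k, temperature, and speculate_k
    # arrived as separate parameters. After: they travel on generator_args,
    # so call sites shrink, as in main() above.
    print(generator_args.max_new_tokens, generator_args.temperature)

_main(GeneratorArgs(max_new_tokens=50, temperature=0.0))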
