diff --git a/.github/workflows/compile-bf16.yml b/.github/workflows/compile-dtype.yml
similarity index 98%
rename from .github/workflows/compile-bf16.yml
rename to .github/workflows/compile-dtype.yml
index c255e3a60..ec99f9e3f 100644
--- a/.github/workflows/compile-bf16.yml
+++ b/.github/workflows/compile-dtype.yml
@@ -11,7 +11,7 @@ jobs:
   run-tinystories:
     strategy:
       matrix:
-        runner: [ubuntu-latest, macos-14, macos-12]
+        runner: [ubuntu-latest, macos-14]
     runs-on: ${{matrix.runner}}
     steps:
     - name: Checkout repo
@@ -102,9 +102,6 @@ jobs:
         echo "******************************************"
         echo "******** INT4 group-wise quantized *******"
         echo "******************************************"
-        if [ ${DTYPE} == float16 ]; then
-          DTYPE=bfloat16
-        fi
         python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
         cat ./output_eager
diff --git a/.github/workflows/compile_t4-dtype.yml b/.github/workflows/compile_t4-dtype.yml
index e0e3259c2..8c15eb4b1 100644
--- a/.github/workflows/compile_t4-dtype.yml
+++ b/.github/workflows/compile_t4-dtype.yml
@@ -68,7 +68,7 @@ jobs:
         echo "******************************************"
         python generate.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
         cat ./output_eager
-        python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{" embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+        python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
         cat ./output_compiled
         python export.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
         python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
@@ -79,7 +79,7 @@ jobs:
         echo "******************************************"
         python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
         cat ./output_eager
-        python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{" linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+        python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
         cat ./output_compiled
         python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
         python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
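The only functional change to compile_t4-dtype.yml above is dropping a stray leading space inside the `--quant` JSON keys (`" embedding"` becomes `"embedding"`, `" linear:int8"` becomes `"linear:int8"`). Assuming generate.py parses the spec with `json.loads` and dispatches on exact key names (the handler table below is hypothetical, not torchchat's actual code), a minimal sketch of why the space broke the compiled runs:

```python
import json

# Hypothetical handler table; the real torchchat dispatch may differ.
QUANT_HANDLERS = {
    "embedding": lambda cfg: f"quantize embeddings with {cfg}",
    "linear:int8": lambda cfg: f"quantize linear layers with {cfg}",
}

def apply_quant_spec(spec: str) -> None:
    """Parse a --quant JSON string and dispatch on exact key names."""
    for scheme, cfg in json.loads(spec).items():
        handler = QUANT_HANDLERS.get(scheme)
        if handler is None:
            # '" embedding"' is valid JSON but never matches "embedding" here.
            raise ValueError(f"unknown quantization scheme: {scheme!r}")
        print(handler(cfg))

apply_quant_spec('{"embedding" : {"bitwidth": 8, "groupsize": 8}}')    # ok
# apply_quant_spec('{" embedding" : {"bitwidth": 8, "groupsize": 8}}')  # raises
```

The JSON itself parses either way, so the typo surfaces only at scheme lookup, not at parse time.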
diff --git a/.github/workflows/eager-dtype.yml b/.github/workflows/eager-dtype.yml
new file mode 100644
index 000000000..d73832dde
--- /dev/null
+++ b/.github/workflows/eager-dtype.yml
@@ -0,0 +1,87 @@
+name: Eager-dtype main
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  run-tinystories:
+    strategy:
+      matrix:
+        runner: [macos-12]
+    runs-on: ${{matrix.runner}}
+    steps:
+    - name: Checkout repo
+      uses: actions/checkout@v2
+    - name: Setup Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.11
+    - name: Print machine info
+      run: |
+        uname -a
+        if [ $(uname -s) == Darwin ]; then
+          sysctl machdep.cpu.brand_string
+          sysctl machdep.cpu.core_count
+        fi
+    - name: Install requirements
+      run: |
+        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+        pip install -r requirements.txt
+    - name: Download checkpoints
+      run: |
+        mkdir -p checkpoints/stories15M
+        pushd checkpoints/stories15M
+        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+        popd
+    - name: Run inference
+      run: |
+        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+        export MODEL_NAME=stories15M
+        export MODEL_DIR=/tmp
+        for DTYPE in bfloat16 float16 float32; do
+          # if [ $(uname -s) == Darwin ]; then
+          #   export DTYPE=float16
+          # fi
+          python generate.py --dtype ${DTYPE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+
+          echo "******************************************"
+          echo "******* Emb: channel-wise quantized ******"
+          echo "******************************************"
+          python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+
+          echo "******************************************"
+          echo "******** Emb: group-wise quantized *******"
+          echo "******************************************"
+          python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+
+          echo "******************************************"
+          echo "******* INT8 channel-wise quantized ******"
+          echo "******************************************"
+          python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+
+          echo "******************************************"
+          echo "******** INT8 group-wise quantized *******"
+          echo "******************************************"
+          python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+
+          echo "******************************************"
+          echo "******** INT4 group-wise quantized *******"
+          echo "******************************************"
+
+          python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+
+          echo "tests complete for ${DTYPE}"
+        done
+
+        echo "tests complete for all dtypes!"
\ No newline at end of file
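The new eager-dtype.yml workflow loops generate.py over three `--dtype` names. As a rough sketch of how such a flag might be resolved to a `torch.dtype` inside the script (`DTYPE_MAP` and `resolve_dtype` are illustrative names, not torchchat's actual parsing):

```python
import torch

# Assumed name-to-dtype mapping; torchchat's real --dtype handling may differ.
DTYPE_MAP = {
    "bfloat16": torch.bfloat16,
    "float16": torch.float16,
    "float32": torch.float32,
}

def resolve_dtype(name: str) -> torch.dtype:
    """Map a CLI dtype string to the corresponding torch.dtype."""
    try:
        return DTYPE_MAP[name]
    except KeyError:
        raise ValueError(f"unsupported dtype: {name}") from None

# Mirrors the CI loop: for DTYPE in bfloat16 float16 float32; do ...
for name in ["bfloat16", "float16", "float32"]:
    print(name, "->", resolve_dtype(name))
```

Running all three dtypes through every quantization scheme is also why the float16-to-bfloat16 override was dropped from the INT4 block in compile-dtype.yml: the eager path now exercises each dtype as given.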
diff --git a/generate.py b/generate.py
index 70288b450..a199c7e4b 100644
--- a/generate.py
+++ b/generate.py
@@ -31,6 +31,7 @@
 @dataclass
 class GeneratorArgs:
     prompt: str = "torchchat is pronounced torch-chat and is so cool because"
+    encoded_prompt: Optional[torch.Tensor] = None
     chat_mode: bool = False
     gui_mode: bool = False
     num_samples: int = 1
@@ -45,6 +46,7 @@ class GeneratorArgs:
     def from_args(cls, args):  # -> GeneratorArgs:
         return cls(
             prompt=args.prompt,
+            encoded_prompt=None,
             chat_mode=args.chat,
             gui_mode=args.gui,
             num_samples=args.num_samples,
@@ -432,7 +434,6 @@ def callback(x):
         t0 = time.perf_counter()
         import contextlib
 
-        generator_args.encoded_prompt = encoded
         if (i != generator_args.num_samples - 1 or not profile) or (
             use_tp and rank != 0
         ):
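The generate.py change promotes the encoded prompt from an ad-hoc attribute set inside the generation loop (`generator_args.encoded_prompt = encoded`, now removed) to a declared `Optional[torch.Tensor]` field on the `GeneratorArgs` dataclass. A minimal sketch of the resulting pattern, with a stand-in `encode()` in place of torchchat's real tokenizer:

```python
from dataclasses import dataclass
from typing import Optional

import torch

@dataclass
class GeneratorArgs:
    prompt: str = "torchchat is pronounced torch-chat and is so cool because"
    encoded_prompt: Optional[torch.Tensor] = None

# Hypothetical stand-in; torchchat encodes with its SentencePiece tokenizer.
def encode(text: str) -> torch.Tensor:
    return torch.tensor([ord(c) for c in text], dtype=torch.int64)

args = GeneratorArgs()
if args.encoded_prompt is None:  # encode once, then reuse across samples
    args.encoded_prompt = encode(args.prompt)
print(args.encoded_prompt.shape)
```

Declaring the field keeps the dataclass self-describing: every attribute a `GeneratorArgs` instance can carry is visible in its definition rather than being attached at an arbitrary point in the generation code.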