
Fix tests #235

Merged: 15 commits, Apr 17, 2024
Changes from 12 commits
@@ -11,7 +11,7 @@ jobs:
  run-tinystories:
    strategy:
      matrix:
-        runner: [ubuntu-latest, macos-14, macos-12]
+        runner: [ubuntu-latest, macos-14]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout repo
@@ -102,9 +102,6 @@ jobs:
echo "******************************************"
echo "******** INT4 group-wise quantized *******"
echo "******************************************"
-if [ ${DTYPE} == float16 ]; then
-  DTYPE=bfloat16
-fi

python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
2 changes: 1 addition & 1 deletion .github/workflows/compile_t4-dtype.yml
@@ -68,7 +68,7 @@ jobs:
echo "******************************************"
python generate.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{" embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
cat ./output_compiled
python export.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
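The single change to compile_t4-dtype.yml above deletes a stray space inside a JSON key (" embedding" becomes "embedding"). A minimal Python sketch of why one space breaks the run, assuming quantization schemes are dispatched by exact key lookup; the handler table and function here are hypothetical illustration, not torchchat's actual code:

import json

# Hypothetical dispatch table; torchchat's real quantization code differs.
QUANT_HANDLERS = {
    "embedding": lambda cfg: f"8-bit embedding quant, groupsize={cfg['groupsize']}",
    "linear:int8": lambda cfg: f"int8 linear quant, groupsize={cfg['groupsize']}",
}

def apply_quant(quant_json: str) -> None:
    for key, cfg in json.loads(quant_json).items():
        handler = QUANT_HANDLERS.get(key)
        if handler is None:
            # " embedding" (leading space) is a different string from
            # "embedding", so an exact lookup finds no handler.
            raise ValueError(f"unknown quantization scheme: {key!r}")
        print(handler(cfg))

apply_quant('{"embedding": {"bitwidth": 8, "groupsize": 8}}')   # matches
# apply_quant('{" embedding": {"bitwidth": 8, "groupsize": 8}}')  # would raise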
87 changes: 87 additions & 0 deletions .github/workflows/eager-dtype.yml
@@ -0,0 +1,87 @@
name: Compile-dtype main
[Contributor review comment on the line above: Rename?]

on:
  push:
    branches:
      - main
  pull_request:
  workflow_dispatch:

jobs:
  run-tinystories:
    strategy:
      matrix:
        runner: [macos-12]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v2
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.11
      - name: Print machine info
        run: |
          uname -a
          if [ $(uname -s) == Darwin ]; then
            sysctl machdep.cpu.brand_string
            sysctl machdep.cpu.core_count
          fi
      - name: Install requirements
        run: |
          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
      - name: Download checkpoints
        run: |
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
      - name: Run inference
        run: |
          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
          export MODEL_NAME=stories15M
          export MODEL_DIR=/tmp
          for DTYPE in bfloat16 float16 float32; do
            # if [ $(uname -s) == Darwin ]; then
            #   export DTYPE=float16
            # fi
            python generate.py --dtype ${DTYPE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******* Emb: channel-wise quantized ******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******** Emb: group-wise quantized *******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******* INT8 channel-wise quantized ******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******** INT8 group-wise quantized *******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******** INT4 group-wise quantized *******"
            echo "******************************************"

            python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "tests complete for ${DTYPE}"
          done

          echo "tests complete for all dtypes!"
21 changes: 8 additions & 13 deletions generate.py
@@ -31,6 +31,7 @@
@dataclass
class GeneratorArgs:
    prompt: str = "torchchat is pronounced torch-chat and is so cool because"
+    encoded_prompt: Optional[torch.Tensor] = None
    chat_mode: bool = False
    gui_mode: bool = False
    num_samples: int = 1
@@ -45,6 +46,7 @@ class GeneratorArgs:
    def from_args(cls, args):  # -> GeneratorArgs:
        return cls(
            prompt=args.prompt,
+            encoded_prompt=None,
            chat_mode=args.chat,
            gui_mode=args.gui,
            num_samples=args.num_samples,
@@ -305,7 +307,7 @@ def generate(
    return seq, generate_stats


-def encode_tokens(tokenizer, string, bos=True, device="cuda"):
+def encode_tokens(tokenizer, string, bos=True, device="cpu"):
    tokens = tokenizer.encode(string)
    if bos:
        tokens = [tokenizer.bos_id()] + tokens
@@ -317,13 +319,9 @@ def _main(
    speculative_builder_args: BuilderArgs,
    tokenizer_args: TokenizerArgs,
    generator_args: GeneratorArgs,
-    max_new_tokens: int = 100,
-    top_k: int = 200,
-    temperature: float = 0.8,
    compile: bool = True,
    compile_prefill: bool = False,
    profile: Optional[Path] = None,
-    speculate_k: int = 5,
    quantize=None,
) -> None:
    """Generates text samples based on a pre-trained Transformer model and tokenizer."""
@@ -436,6 +434,7 @@ def callback(x):
    t0 = time.perf_counter()
    import contextlib

+    generator_args.encoded_prompt = encoded
    if (i != generator_args.num_samples - 1 or not profile) or (use_tp and rank != 0):
        prof = contextlib.nullcontext()
    else:
@@ -445,13 +444,13 @@ def callback(x):
        y, metrics = generate(
            model,
            encoded,
-            max_new_tokens,
+            generator_args.max_new_tokens,
            draft_model=draft_model,
-            speculate_k=speculate_k,
+            speculate_k=generator_args.speculate_k,
            chat_mode=generator_args.chat_mode,
            callback=callback,
-            temperature=temperature,
-            top_k=top_k,
+            temperature=generator_args.temperature,
+            top_k=generator_args.top_k,
        )
        aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
        if i == -1:
@@ -502,13 +501,9 @@ def main(args):
        speculative_builder_args,
        tokenizer_args,
        generator_args,
-        args.max_new_tokens,
-        args.top_k,
-        args.temperature,
        args.compile,
        args.compile_prefill,
        args.profile,
-        args.speculate_k,
        args.quantize,
    )
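Taken together, the generate.py changes consolidate per-run generation options onto GeneratorArgs instead of threading them through _main() as loose parameters. A hedged sketch of the resulting pattern, where field names follow the diff and everything else is simplified illustration:

from dataclasses import dataclass
from typing import Optional

import torch

@dataclass
class GeneratorArgs:
    # Fields mirror the diff; the real dataclass carries more.
    prompt: str = "torchchat is pronounced torch-chat and is so cool because"
    encoded_prompt: Optional[torch.Tensor] = None
    max_new_tokens: int = 100
    top_k: int = 200
    temperature: float = 0.8
    speculate_k: int = 5

def _main(generator_args: GeneratorArgs) -> None:
    # Before this PR: max_new_tokens, top_k, temperature, and speculate_k
    # arrived as separate parameters. After: they travel on generator_args,
    # so call sites shrink, as in main() above.
    print(generator_args.max_new_tokens, generator_args.temperature)

_main(GeneratorArgs(max_new_tokens=50, temperature=0.0))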
