From 7a4209a0fc77cb6be77d6884f647faaaed47fb52 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Tue, 16 Apr 2024 22:55:22 -0700
Subject: [PATCH 01/12] prompt

---
 generate.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/generate.py b/generate.py
index 4d52b4c8b..8b78bbd3a 100644
--- a/generate.py
+++ b/generate.py
@@ -316,6 +316,7 @@ def _main(
     builder_args: BuilderArgs,
     speculative_builder_args: BuilderArgs,
     tokenizer_args: TokenizerArgs,
+    generator_args: GeneratorArgs,
     prompt: str = "Hello, my name is",
     chat_mode: bool = False,
     num_samples: int = 5,
@@ -365,7 +366,9 @@ def _main(
     else:
         draft_model = None

-    encoded = encode_tokens(tokenizer, prompt, bos=True, device=builder_args.device)
+    encoded = encode_tokens(
+        tokenizer, generator_args.prompt, bos=True, device=builder_args.device
+    )
     print(encoded)
     prompt_length = encoded.size(0)

From a4e09667202cf0b7e43f1485114ff0a77f433986 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Tue, 16 Apr 2024 23:03:32 -0700
Subject: [PATCH 02/12] chat_mode, num_samples

---
 generate.py | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/generate.py b/generate.py
index 8b78bbd3a..4c1fb679a 100644
--- a/generate.py
+++ b/generate.py
@@ -31,22 +31,22 @@
 @dataclass
 class GeneratorArgs:
     prompt: str = "torchchat is pronounced torch-chat and is so cool because"
-    chat: bool = (False,)
-    gui: bool = (False,)
-    num_samples: int = (1,)
-    max_new_tokens: int = (200,)
-    top_k: int = (200,)
-    temperature: int = (0,) # deterministic argmax
-    compile: bool = (False,)
-    compile_prefill: bool = (False,)
-    speculate_k: int = (5,)
+    chat_mode: bool = False
+    gui_mode: bool = False
+    num_samples: int = 1
+    max_new_tokens: int = 200
+    top_k: int = 200
+    temperature: int = 0 # deterministic argmax
+    compile: bool = False
+    compile_prefill: bool = False
+    speculate_k: int = 5

     @classmethod
     def from_args(cls, args): # -> GeneratorArgs:
         return cls(
             prompt=args.prompt,
-            chat=args.chat,
-            gui=args.gui,
+            chat_mode=args.chat,
+            gui_mode=args.gui,
             num_samples=args.num_samples,
             max_new_tokens=args.max_new_tokens,
             top_k=args.top_k,
@@ -317,9 +317,6 @@ def _main(
     speculative_builder_args: BuilderArgs,
     tokenizer_args: TokenizerArgs,
     generator_args: GeneratorArgs,
-    prompt: str = "Hello, my name is",
-    chat_mode: bool = False,
-    num_samples: int = 5,
     max_new_tokens: int = 100,
     top_k: int = 200,
     temperature: float = 0.8,
@@ -407,9 +404,9 @@ def _main(
     }

     start = -1 if compile else 0
-    for i in range(start, num_samples):
+    for i in range(start, generator_args.num_samples):
         device_sync(device=builder_args.device)
-        if i >= 0 and chat_mode:
+        if i >= 0 and generator_args.chat_mode:
             prompt = input("What is your prompt? ")
             if is_chat:
                 prompt = f"{B_INST} {prompt.strip()} {E_INST}"
@@ -417,7 +414,7 @@ def _main(
             encoded = encode_tokens(
                 tokenizer, prompt, bos=True, device=builder_args.device
             )

-        if chat_mode and i >= 0:
+        if generator_args.chat_mode and i >= 0:
             buffer = []
             period_id = tokenizer.encode(".")[0]
             done_generating = False
@@ -439,7 +436,7 @@ def callback(x):
         t0 = time.perf_counter()
         import contextlib

-        if (i != num_samples - 1 or not profile) or (use_tp and rank != 0):
+        if (i != generator_args.num_samples - 1 or not profile) or (use_tp and rank != 0):
             prof = contextlib.nullcontext()
         else:
             torch.profiler._utils._init_for_cuda_graphs()
@@ -451,7 +448,7 @@ def callback(x):
                 max_new_tokens,
                 draft_model=draft_model,
                 speculate_k=speculate_k,
-                chat_mode=chat_mode,
+                chat_mode=generator_args.chat_mode,
                 callback=callback,
                 temperature=temperature,
                 top_k=top_k,
@@ -468,7 +465,7 @@ def callback(x):
         device_sync(device=builder_args.device)
         t = time.perf_counter() - t0

-        if not chat_mode:
+        if not generator_args.chat_mode:
             print(tokenizer.decode(y.tolist()))
         else:
             print()
@@ -498,13 +495,13 @@ def main(args):
     builder_args = BuilderArgs.from_args(args)
     speculative_builder_args = BuilderArgs.from_speculative_args(args)
     tokenizer_args = TokenizerArgs.from_args(args)
+    generator_args = GeneratorArgs.from_args(args)
+
     _main(
         builder_args,
         speculative_builder_args,
         tokenizer_args,
-        args.prompt,
-        args.chat,
-        args.num_samples,
+        generator_args,
         args.max_new_tokens,
         args.top_k,
         args.temperature,

From 7af1832667d0cf9a9175ee2499dd5f4f4e788c38 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Tue, 16 Apr 2024 23:22:35 -0700
Subject: [PATCH 03/12] move more args

---
 generate.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/generate.py b/generate.py
index 4c1fb679a..86fb596ae 100644
--- a/generate.py
+++ b/generate.py
@@ -317,9 +317,6 @@ def _main(
     speculative_builder_args: BuilderArgs,
     tokenizer_args: TokenizerArgs,
     generator_args: GeneratorArgs,
-    max_new_tokens: int = 100,
-    top_k: int = 200,
-    temperature: float = 0.8,
     compile: bool = True,
     compile_prefill: bool = False,
     profile: Optional[Path] = None,
@@ -445,13 +442,13 @@ def callback(x):
             y, metrics = generate(
                 model,
                 encoded,
-                max_new_tokens,
+                generator_args.max_new_tokens,
                 draft_model=draft_model,
                 speculate_k=speculate_k,
                 chat_mode=generator_args.chat_mode,
                 callback=callback,
-                temperature=temperature,
-                top_k=top_k,
+                temperature=generator_args.temperature,
+                top_k=generator_args.top_k,
             )
             aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
             if i == -1:
@@ -502,9 +499,6 @@ def main(args):
         speculative_builder_args,
         tokenizer_args,
         generator_args,
-        args.max_new_tokens,
-        args.top_k,
-        args.temperature,
         args.compile,
         args.compile_prefill,
         args.profile,

From ad0fbec8ab8bae8ae53b2a3e16054cd5b83adb44 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Tue, 16 Apr 2024 23:33:52 -0700
Subject: [PATCH 04/12] more gen args

---
 generate.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/generate.py b/generate.py
index 86fb596ae..bc56ac365 100644
--- a/generate.py
+++ b/generate.py
@@ -31,6 +31,7 @@
 @dataclass
 class GeneratorArgs:
     prompt: str = "torchchat is pronounced torch-chat and is so cool because"
+    encoded_prompt: Optional[torch.Tensor] = None
     chat_mode: bool = False
     gui_mode: bool = False
     num_samples: int = 1
@@ -45,6 +46,7 @@ class GeneratorArgs:
     def from_args(cls, args): # -> GeneratorArgs:
         return cls(
             prompt=args.prompt,
+            encoded_prompt=None,
             chat_mode=args.chat,
             gui_mode=args.gui,
             num_samples=args.num_samples,
@@ -229,8 +231,7 @@ def speculative_decode(
 @torch.no_grad()
 def generate(
     model: Transformer,
-    prompt: torch.Tensor,
-    max_new_tokens: int,
+    generator_args: Generator_Args,
     *,
     chat_mode: bool,
     draft_model: Transformer,
@@ -241,11 +242,11 @@ def generate(
     """
     Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
     """
-
+    prompt = generator_args.encoded_prompt
     is_speculative = draft_model is not None
     # create an empty tensor of the expected final shape and fill in the current tokens
     T = prompt.size(0)
-    T_new = T + max_new_tokens
+    T_new = T + generator_args.max_new_tokens
     if chat_mode:
         max_seq_length = 350
     else:
         max_seq_length = min(T_new, model.config.block_size)
@@ -295,7 +296,7 @@ def generate(
             model,
             next_token.view(1, -1),
             input_pos,
-            max_new_tokens - 1,
+            generator_args.max_new_tokens - 1,
             callback=callback,
             **sampling_kwargs,
         )
@@ -305,7 +306,7 @@ def generate(
     return seq, generate_stats

-def encode_tokens(tokenizer, string, bos=True, device="cuda"):
+def encode_tokens(tokenizer, string, bos=True, device):
     tokens = tokenizer.encode(string)
     if bos:
         tokens = [tokenizer.bos_id()] + tokens
@@ -320,7 +321,6 @@ def _main(
     compile: bool = True,
     compile_prefill: bool = False,
     profile: Optional[Path] = None,
-    speculate_k: int = 5,
     quantize=None,
 ) -> None:
     """Generates text samples based on a pre-trained Transformer model and tokenizer."""
@@ -441,10 +441,8 @@ def callback(x):
         with prof:
             y, metrics = generate(
                 model,
-                encoded,
-                generator_args.max_new_tokens,
                 draft_model=draft_model,
-                speculate_k=speculate_k,
+                speculate_k=generator_args.speculate_k,
                 chat_mode=generator_args.chat_mode,
                 callback=callback,
                 temperature=generator_args.temperature,
                 top_k=generator_args.top_k,

From 3f49018b62b3fe6fdb8c6894bbd3d0e2b4c035e6 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 05:55:00 -0700
Subject: [PATCH 05/12] update

---
 generate.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/generate.py b/generate.py
index bc56ac365..11b62fc7d 100644
--- a/generate.py
+++ b/generate.py
@@ -233,7 +233,6 @@ def generate(
     model: Transformer,
     generator_args: Generator_Args,
     *,
-    chat_mode: bool,
     draft_model: Transformer,
     speculate_k: Optional[int] = 8,
     callback=lambda x: x,
@@ -247,7 +246,7 @@ def generate(
     # create an empty tensor of the expected final shape and fill in the current tokens
     T = prompt.size(0)
     T_new = T + generator_args.max_new_tokens
-    if chat_mode:
+    if generator_args.chat_mode:
         max_seq_length = 350
     else:
         max_seq_length = min(T_new, model.config.block_size)

From b961023da264c5acf7c5f9e308fa3c212e271349 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 05:57:22 -0700
Subject: [PATCH 06/12] args

---
 generate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/generate.py b/generate.py
index 11b62fc7d..bb43b764b 100644
--- a/generate.py
+++ b/generate.py
@@ -305,7 +305,7 @@ def generate(
     return seq, generate_stats

-def encode_tokens(tokenizer, string, bos=True, device):
+def encode_tokens(tokenizer, string, bos=True, device="cpu"):
     tokens = tokenizer.encode(string)
     if bos:
         tokens = [tokenizer.bos_id()] + tokens

From 5162b9e6cab2fe420f22ed8ca3a792d66f964831 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:08:27 -0700
Subject: [PATCH 07/12] undo some changes

---
 generate.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/generate.py b/generate.py
index bb43b764b..19599ad08 100644
--- a/generate.py
+++ b/generate.py
@@ -231,8 +231,10 @@ def speculative_decode(
 @torch.no_grad()
 def generate(
     model: Transformer,
-    generator_args: Generator_Args,
+    prompt: torch.Tensor,
+    max_new_tokens: int,
     *,
+    chat_mode: bool,
     draft_model: Transformer,
     speculate_k: Optional[int] = 8,
     callback=lambda x: x,
@@ -241,12 +243,12 @@ def generate(
     """
     Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
     """
-    prompt = generator_args.encoded_prompt
+
     is_speculative = draft_model is not None
     # create an empty tensor of the expected final shape and fill in the current tokens
     T = prompt.size(0)
-    T_new = T + generator_args.max_new_tokens
+    T_new = T + max_new_tokens
-    if generator_args.chat_mode:
+    if chat_mode:
         max_seq_length = 350
     else:
         max_seq_length = min(T_new, model.config.block_size)
@@ -295,7 +297,7 @@ def generate(
             model,
             next_token.view(1, -1),
             input_pos,
-            generator_args.max_new_tokens - 1,
+            max_new_tokens - 1,
             callback=callback,
             **sampling_kwargs,
         )
@@ -432,6 +434,7 @@ def callback(x):
         t0 = time.perf_counter()
         import contextlib
+        generator_args.encoded_prompt = encoded
         if (i != generator_args.num_samples - 1 or not profile) or (use_tp and rank != 0):
             prof = contextlib.nullcontext()
         else:
             torch.profiler._utils._init_for_cuda_graphs()
@@ -440,12 +443,14 @@ def callback(x):
         with prof:
             y, metrics = generate(
                 model,
+                encoded,
+                max_new_tokens,
                 draft_model=draft_model,
-                speculate_k=generator_args.speculate_k,
+                speculate_k=speculate_k,
                 chat_mode=generator_args.chat_mode,
                 callback=callback,
-                temperature=generator_args.temperature,
-                top_k=generator_args.top_k,
+                temperature=temperature,
+                top_k=top_k,
             )
             aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
             if i == -1:
@@ -499,7 +504,6 @@ def main(args):
         args.compile,
         args.compile_prefill,
         args.profile,
-        args.speculate_k,
         args.quantize,
     )

From 0846920e191b9ca0d05d978f0b4d820fa9a74141 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:10:30 -0700
Subject: [PATCH 08/12] typos

---
 generate.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/generate.py b/generate.py
index 19599ad08..a1ede96e3 100644
--- a/generate.py
+++ b/generate.py
@@ -444,13 +444,13 @@ def callback(x):
             y, metrics = generate(
                 model,
                 encoded,
-                max_new_tokens,
+                generator_args.max_new_tokens,
                 draft_model=draft_model,
-                speculate_k=speculate_k,
+                speculate_k=generator_args.speculate_k,
                 chat_mode=generator_args.chat_mode,
                 callback=callback,
-                temperature=temperature,
-                top_k=top_k,
+                temperature=generator_args.temperature,
+                top_k=generator_args.top_k,
             )
             aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
             if i == -1:

From 95da42123d515d33510e27bbb764454691b07c51 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:24:54 -0700
Subject: [PATCH 09/12] typo

---
 .github/workflows/compile_t4-dtype.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/compile_t4-dtype.yml b/.github/workflows/compile_t4-dtype.yml
index e0e3259c2..2a6980cb5 100644
--- a/.github/workflows/compile_t4-dtype.yml
+++ b/.github/workflows/compile_t4-dtype.yml
@@ -68,7 +68,7 @@ jobs:
           echo "******************************************"
           python generate.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
           cat ./output_eager
-          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{" embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
           cat ./output_compiled
           python export.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
           python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti

From cf9bea7a90d636429c53ba8417e76dba7ec29f1d Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:31:41 -0700
Subject: [PATCH 10/12] fix tests

---
 .../{compile-bf16.yml => compile-dtype.yml}  |  5 +-
 .github/workflows/eager-dtype.yml            | 87 +++++++++++++++++++
 2 files changed, 88 insertions(+), 4 deletions(-)
 rename .github/workflows/{compile-bf16.yml => compile-dtype.yml} (98%)
 create mode 100644 .github/workflows/eager-dtype.yml

diff --git a/.github/workflows/compile-bf16.yml b/.github/workflows/compile-dtype.yml
similarity index 98%
rename from .github/workflows/compile-bf16.yml
rename to .github/workflows/compile-dtype.yml
index c255e3a60..ec99f9e3f 100644
--- a/.github/workflows/compile-bf16.yml
+++ b/.github/workflows/compile-dtype.yml
@@ -11,7 +11,7 @@ jobs:
   run-tinystories:
     strategy:
       matrix:
-        runner: [ubuntu-latest, macos-14, macos-12]
+        runner: [ubuntu-latest, macos-14]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout repo
@@ -102,9 +102,6 @@ jobs:
           echo "******************************************"
           echo "******** INT4 group-wise quantized *******"
           echo "******************************************"
-          if [ ${DTYPE} == float16 ]; then
-            DTYPE=bfloat16
-          fi
           python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
           cat ./output_eager

diff --git a/.github/workflows/eager-dtype.yml b/.github/workflows/eager-dtype.yml
new file mode 100644
index 000000000..f8f564b7b
--- /dev/null
+++ b/.github/workflows/eager-dtype.yml
@@ -0,0 +1,87 @@
+name: Compile-dtype main
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  run-tinystories:
+    strategy:
+      matrix:
+        runner: [macos-12]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.11
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Install requirements
+        run: |
+          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+          pip install -r requirements.txt
+      - name: Download checkpoints
+        run: |
+          mkdir -p checkpoints/stories15M
+          pushd checkpoints/stories15M
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+          popd
+      - name: Run inference
+        run: |
+          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+          export MODEL_NAME=stories15M
+          export MODEL_DIR=/tmp
+          for DTYPE in bfloat16 float16 float32; do
+            # if [ $(uname -s) == Darwin ]; then
+            #   export DTYPE=float16
+            # fi
+            python generate.py --dtype ${DTYPE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "******************************************"
+            echo "******* Emb: channel-wise quantized ******"
+            echo "******************************************"
+            python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "******************************************"
+            echo "******** Emb: group-wise quantized *******"
+            echo "******************************************"
+            python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "******************************************"
+            echo "******* INT8 channel-wise quantized ******"
+            echo "******************************************"
+            python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "******************************************"
+            echo "******** INT8 group-wise quantized *******"
+            echo "******************************************"
+            python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "******************************************"
+            echo "******** INT4 group-wise quantized *******"
+            echo "******************************************"
+
+            python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+            cat ./output_eager
+
+            echo "tests complete for ${DTYPE}"
+          done
+
+          echo "tests complete for all dtypes!"
\ No newline at end of file

From 4d64713fc777414821de405b91388f25e14a91bd Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:49:53 -0700
Subject: [PATCH 11/12] typo

---
 .github/workflows/compile_t4-dtype.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/compile_t4-dtype.yml b/.github/workflows/compile_t4-dtype.yml
index 2a6980cb5..8c15eb4b1 100644
--- a/.github/workflows/compile_t4-dtype.yml
+++ b/.github/workflows/compile_t4-dtype.yml
@@ -79,7 +79,7 @@ jobs:
           echo "******************************************"
           python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
           cat ./output_eager
-          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{" linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
           cat ./output_compiled
           python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
           python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti

From 3e48b0060dde25f6a5d7ab514173aed288ea7e72 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Wed, 17 Apr 2024 06:53:21 -0700
Subject: [PATCH 12/12] typo

---
 .github/workflows/eager-dtype.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/eager-dtype.yml b/.github/workflows/eager-dtype.yml
index f8f564b7b..d73832dde 100644
--- a/.github/workflows/eager-dtype.yml
+++ b/.github/workflows/eager-dtype.yml
@@ -1,4 +1,4 @@
-name: Compile-dtype main
+name: Eager-dtype main
 
 on:
   push:
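Taken together, the generate.py changes in patches 01-08 replace the loose prompt, chat_mode, num_samples, max_new_tokens, top_k, temperature, and speculate_k parameters of _main() with a single GeneratorArgs object built once from the parsed command-line arguments. The sketch below is illustrative only: the field names and defaults mirror the dataclass as it stands after PATCH 02 and PATCH 04, while the simplified _main() body and the direct construction at the end are assumptions for demonstration, not code from the repository.

from dataclasses import dataclass
from typing import Optional


@dataclass
class GeneratorArgs:
    # Field names and defaults as introduced in PATCH 02 and PATCH 04.
    prompt: str = "torchchat is pronounced torch-chat and is so cool because"
    encoded_prompt: Optional[object] = None  # filled with the tokenized prompt at runtime
    chat_mode: bool = False
    gui_mode: bool = False
    num_samples: int = 1
    max_new_tokens: int = 200
    top_k: int = 200
    temperature: int = 0  # deterministic argmax
    compile: bool = False
    compile_prefill: bool = False
    speculate_k: int = 5


def _main(generator_args: GeneratorArgs) -> None:
    # Stand-in for the real _main(): options are read off the dataclass
    # instead of being passed around as separate keyword arguments.
    for _ in range(generator_args.num_samples):
        print(
            f"would generate {generator_args.max_new_tokens} tokens "
            f"for prompt {generator_args.prompt!r}"
        )


if __name__ == "__main__":
    _main(GeneratorArgs(num_samples=2, max_new_tokens=32))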