From db7de580589d6544e062822e06805553e10fc749 Mon Sep 17 00:00:00 2001
From: Tirth Patel
Date: Thu, 22 Feb 2024 23:16:31 +0000
Subject: [PATCH 1/3] Fix Mistral memory consumption with JAX and default dtype bug

---
 keras_nlp/models/mistral/mistral_causal_lm.py | 1 +
 keras_nlp/models/mistral/mistral_presets.py   | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/keras_nlp/models/mistral/mistral_causal_lm.py b/keras_nlp/models/mistral/mistral_causal_lm.py
index 3296bb949..e9bd4e561 100644
--- a/keras_nlp/models/mistral/mistral_causal_lm.py
+++ b/keras_nlp/models/mistral/mistral_causal_lm.py
@@ -190,6 +190,7 @@ def next(prompt, cache, index):
             mask=padding_mask,
             end_token_id=end_token_id,
             hidden_states=hidden_states,
+            model=self,
         )
 
         # Compute an output padding mask with the token ids we updated.
diff --git a/keras_nlp/models/mistral/mistral_presets.py b/keras_nlp/models/mistral/mistral_presets.py
index 82a2ec44f..67902ac37 100644
--- a/keras_nlp/models/mistral/mistral_presets.py
+++ b/keras_nlp/models/mistral/mistral_presets.py
@@ -23,7 +23,7 @@
             "path": "mistral",
             "model_card": "https://github.com/mistralai/mistral-src/blob/main/README.md",
         },
-        "kaggle_handle": "kaggle://keras/mistral/keras/mistral_7b_en/3",
+        "kaggle_handle": "kaggle://keras/mistral/keras/mistral_7b_en/4",
     },
     "mistral_instruct_7b_en": {
         "metadata": {
@@ -33,6 +33,6 @@
             "path": "mistral",
             "model_card": "https://github.com/mistralai/mistral-src/blob/main/README.md",
         },
-        "kaggle_handle": "kaggle://keras/mistral/keras/mistral_instruct_7b_en/3",
+        "kaggle_handle": "kaggle://keras/mistral/keras/mistral_instruct_7b_en/4",
     },
 }
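A note on this first patch: passing model=self into the sampler call gives the sampler a handle on the model's variables, so they can be threaded through the compiled generation loop as explicit inputs rather than closure captures. Under jax.jit, arrays captured by closure are traced as compile-time constants, and XLA may then hold a second copy of every captured weight inside the compiled executable, which is the memory blow-up the subject line refers to. A minimal sketch of the difference, using a stand-in weight matrix rather than KerasNLP's actual sampler internals:

    import jax
    import jax.numpy as jnp

    weights = jnp.ones((4096, 4096))  # stand-in for large decoder weights

    @jax.jit
    def step_closure(x):
        # `weights` is closed over, so tracing treats it as a constant
        # that the compiled executable can embed and duplicate.
        return x @ weights

    @jax.jit
    def step_explicit(x, w):
        # Passed explicitly, the weights remain an input buffer shared
        # with the original array; no duplicate copy is created.
        return x @ w

    x = jnp.ones((1, 4096))
    step_closure(x)
    step_explicit(x, weights)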
"https://github.com/mistralai/mistral-src/blob/main/README.md", }, - "kaggle_handle": "kaggle://keras/mistral/keras/mistral_7b_en/4", + "kaggle_handle": "kaggle://keras/mistral/keras/mistral_7b_en/6", }, "mistral_instruct_7b_en": { "metadata": { @@ -33,6 +33,6 @@ "path": "mistral", "model_card": "https://github.com/mistralai/mistral-src/blob/main/README.md", }, - "kaggle_handle": "kaggle://keras/mistral/keras/mistral_instruct_7b_en/4", + "kaggle_handle": "kaggle://keras/mistral/keras/mistral_instruct_7b_en/6", }, } diff --git a/tools/checkpoint_conversion/convert_mistral_checkpoints.py b/tools/checkpoint_conversion/convert_mistral_checkpoints.py index f1824663a..7e13b9dd7 100644 --- a/tools/checkpoint_conversion/convert_mistral_checkpoints.py +++ b/tools/checkpoint_conversion/convert_mistral_checkpoints.py @@ -11,14 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import datetime import gc -import json import os -import pathlib +import shutil +import tempfile import traceback -import keras import numpy as np import requests from absl import app @@ -27,10 +25,10 @@ from transformers import AutoTokenizer from transformers import MistralForCausalLM -import keras_nlp from keras_nlp.models import MistralBackbone from keras_nlp.models import MistralCausalLMPreprocessor from keras_nlp.models import MistralTokenizer +from keras_nlp.utils.preset_utils import save_to_preset PRESET_MAP = { "mistral_7b_en": "mistralai/Mistral-7B-v0.1", @@ -227,124 +225,74 @@ def main(_): preset = FLAGS.preset hf_preset = PRESET_MAP[preset] - # === Create the save directories === - model_dir = pathlib.Path(__file__).parent / f"{preset}" - tokenizer_dir = model_dir / "assets" / "tokenizer" - if not model_dir.exists(): - os.makedirs(model_dir) - if not tokenizer_dir.exists(): - os.makedirs(tokenizer_dir) + # === Create the temporary save directories === + temp_dir = tempfile.mkdtemp() - # === Load the Huggingface model === - hf_model = MistralForCausalLM.from_pretrained(hf_preset) - hf_tokenizer = AutoTokenizer.from_pretrained(hf_preset) - hf_model.eval() - print("\n-> Huggingface model and tokenizer loaded") - - # === Load the KerasNLP model === - keras_nlp_config = dict( - vocabulary_size=hf_model.config.vocab_size, - hidden_dim=hf_model.config.hidden_size, - num_layers=hf_model.config.num_hidden_layers, - num_query_heads=hf_model.config.num_attention_heads, - num_key_value_heads=hf_model.config.num_key_value_heads, - intermediate_dim=hf_model.config.intermediate_size, - sliding_window=hf_model.config.sliding_window, - layer_norm_epsilon=hf_model.config.rms_norm_eps, - rope_max_wavelength=hf_model.config.rope_theta, - dtype="float32", - ) - keras_nlp_model = MistralBackbone(**keras_nlp_config) - - # === Download the tokenizer from Huggingface model card === - spm_path = ( - f"https://huggingface.co/{hf_preset}/resolve/main/tokenizer.model" - ) - response = requests.get(spm_path) - if not response.ok: - raise ValueError(f"Couldn't fetch {preset}'s tokenizer.") - tokenizer_path = tokenizer_dir / "vocabulary.spm" - with open(tokenizer_path, "wb") as tokenizer_file: - tokenizer_file.write(response.content) - keras_nlp_tokenizer = MistralTokenizer(str(tokenizer_path.absolute())) - print("\n-> Keras 3 model and tokenizer loaded.") - - # === Port the weights === - convert_checkpoints(keras_nlp_model, hf_model) - print("\n-> Weight transfer done.") - - # === Check that the models and 
From 11b2fc07e432c333fa2b5582bd94e8322d398ac3 Mon Sep 17 00:00:00 2001
From: Tirth Patel
Date: Fri, 23 Feb 2024 01:49:46 +0000
Subject: [PATCH 3/3] Use keras_nlp.utils.preset_utils to generate the presets

---
 keras_nlp/models/mistral/mistral_presets.py |   4 +-
 .../convert_mistral_checkpoints.py          | 190 +++++++-----
 2 files changed, 71 insertions(+), 123 deletions(-)

diff --git a/keras_nlp/models/mistral/mistral_presets.py b/keras_nlp/models/mistral/mistral_presets.py
index 67902ac37..7fb4b4e0a 100644
--- a/keras_nlp/models/mistral/mistral_presets.py
+++ b/keras_nlp/models/mistral/mistral_presets.py
@@ -23,7 +23,7 @@
             "path": "mistral",
             "model_card": "https://github.com/mistralai/mistral-src/blob/main/README.md",
         },
-        "kaggle_handle": "kaggle://keras/mistral/keras/mistral_7b_en/4",
+        "kaggle_handle": "kaggle://keras/mistral/keras/mistral_7b_en/6",
     },
     "mistral_instruct_7b_en": {
         "metadata": {
@@ -33,6 +33,6 @@
             "path": "mistral",
             "model_card": "https://github.com/mistralai/mistral-src/blob/main/README.md",
         },
-        "kaggle_handle": "kaggle://keras/mistral/keras/mistral_instruct_7b_en/4",
+        "kaggle_handle": "kaggle://keras/mistral/keras/mistral_instruct_7b_en/6",
     },
 }
diff --git a/tools/checkpoint_conversion/convert_mistral_checkpoints.py b/tools/checkpoint_conversion/convert_mistral_checkpoints.py
index f1824663a..7e13b9dd7 100644
--- a/tools/checkpoint_conversion/convert_mistral_checkpoints.py
+++ b/tools/checkpoint_conversion/convert_mistral_checkpoints.py
@@ -11,14 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import datetime
 import gc
-import json
 import os
-import pathlib
+import shutil
+import tempfile
 import traceback
 
-import keras
 import numpy as np
 import requests
 from absl import app
@@ -27,10 +25,10 @@
 from transformers import AutoTokenizer
 from transformers import MistralForCausalLM
 
-import keras_nlp
 from keras_nlp.models import MistralBackbone
 from keras_nlp.models import MistralCausalLMPreprocessor
 from keras_nlp.models import MistralTokenizer
+from keras_nlp.utils.preset_utils import save_to_preset
 
 PRESET_MAP = {
     "mistral_7b_en": "mistralai/Mistral-7B-v0.1",
@@ -227,124 +225,74 @@ def main(_):
     preset = FLAGS.preset
     hf_preset = PRESET_MAP[preset]
 
-    # === Create the save directories ===
-    model_dir = pathlib.Path(__file__).parent / f"{preset}"
-    tokenizer_dir = model_dir / "assets" / "tokenizer"
-    if not model_dir.exists():
-        os.makedirs(model_dir)
-    if not tokenizer_dir.exists():
-        os.makedirs(tokenizer_dir)
+    # === Create the temporary save directories ===
+    temp_dir = tempfile.mkdtemp()
 
-    # === Load the Huggingface model ===
-    hf_model = MistralForCausalLM.from_pretrained(hf_preset)
-    hf_tokenizer = AutoTokenizer.from_pretrained(hf_preset)
-    hf_model.eval()
-    print("\n-> Huggingface model and tokenizer loaded")
-
-    # === Load the KerasNLP model ===
-    keras_nlp_config = dict(
-        vocabulary_size=hf_model.config.vocab_size,
-        hidden_dim=hf_model.config.hidden_size,
-        num_layers=hf_model.config.num_hidden_layers,
-        num_query_heads=hf_model.config.num_attention_heads,
-        num_key_value_heads=hf_model.config.num_key_value_heads,
-        intermediate_dim=hf_model.config.intermediate_size,
-        sliding_window=hf_model.config.sliding_window,
-        layer_norm_epsilon=hf_model.config.rms_norm_eps,
-        rope_max_wavelength=hf_model.config.rope_theta,
-        dtype="float32",
-    )
-    keras_nlp_model = MistralBackbone(**keras_nlp_config)
-
-    # === Download the tokenizer from Huggingface model card ===
-    spm_path = (
-        f"https://huggingface.co/{hf_preset}/resolve/main/tokenizer.model"
-    )
-    response = requests.get(spm_path)
-    if not response.ok:
-        raise ValueError(f"Couldn't fetch {preset}'s tokenizer.")
-    tokenizer_path = tokenizer_dir / "vocabulary.spm"
-    with open(tokenizer_path, "wb") as tokenizer_file:
-        tokenizer_file.write(response.content)
-    keras_nlp_tokenizer = MistralTokenizer(str(tokenizer_path.absolute()))
-    print("\n-> Keras 3 model and tokenizer loaded.")
-
-    # === Port the weights ===
-    convert_checkpoints(keras_nlp_model, hf_model)
-    print("\n-> Weight transfer done.")
-
-    # === Check that the models and tokenizers outputs match ===
-    test_tokenizer(keras_nlp_tokenizer, hf_tokenizer)
-    test_model(keras_nlp_model, keras_nlp_tokenizer, hf_model, hf_tokenizer)
-    print("\n-> Tests passed!")
-
-    # === Save the model weights in float32 format ===
-    keras_nlp_model.save_weights(
-        str((model_dir / "model.weights.h5").absolute())
-    )
-    print("\n-> Saved the model weights in float16")
-
-    del keras_nlp_model, hf_model
-    gc.collect()
-
-    keras_nlp_config["dtype"] = "float16"
-
-    # === Save the weights again in float16 ===
-    keras_nlp_model = MistralBackbone(**keras_nlp_config)
-    keras_nlp_model.load_weights(
-        str((model_dir / "model.weights.h5").absolute())
-    )
-    keras_nlp_model.save_weights(
-        str((model_dir / "model.weights.h5").absolute())
-    )
-    print("-> Saved the model weights in float16")
-
-    # === Save the model config ===
-    keras_nlp_config.pop("dtype")  # We don't want a default dtype
-    model_config = {
-        "module": "keras_nlp.src.models.mistral.mistral_backbone",
-        "class_name": "MistralBackbone",
-        "config": {**keras_nlp_config},
-        "registered_name": "keras_nlp>MistralBackbone",
-        "assets": [],
-        "weights": "model.weights.h5",
-    }
-    model_config_json = json.dumps(model_config)
-    with open(model_dir / "config.json", "w") as model_config_file:
-        model_config_file.write(model_config_json)
-    print("\n-> Saved model config")
-
-    # === Save the tokenizer config ===
-    tokenizer_config = {
-        "module": "keras_nlp.src.models.mistral.mistral_tokenizer",
-        "class_name": "MistralTokenizer",
-        "config": {
-            "name": "mistral_tokenizer",
-            "trainable": True,
-            "dtype": "int32",
-            "proto": None,
-            "sequence_length": None,
-        },
-        "registered_name": "keras_nlp>MistralTokenizer",
-        "assets": ["assets/tokenizer/vocabulary.spm"],
-        "weights": None,
-    }
-    tokenizer_config_json = json.dumps(tokenizer_config)
-    with open(model_dir / "tokenizer.json", "w") as tokenizer_config_file:
-        tokenizer_config_file.write(tokenizer_config_json)
-    print("\n-> Saved tokenizer config")
-
-    # === Save metadata ===
-    metadata_config = {
-        "keras_version": keras.__version__,
-        "keras_nlp_version": keras_nlp.__version__,
-        "parameter_count": keras_nlp_model.count_params(),
-        "date_saved": datetime.datetime.utcnow().strftime("%Y-%m-%d@%H:%M:%S"),
-    }
-    metadata_config_json = json.dumps(metadata_config)
-    with open(model_dir / "metadata.json", "w") as metadata_config_file:
-        metadata_config_file.write(metadata_config_json)
-    print("\n-> Saved metadata")
+    try:
+        # === Load the Huggingface model ===
+        hf_model = MistralForCausalLM.from_pretrained(hf_preset)
+        hf_tokenizer = AutoTokenizer.from_pretrained(hf_preset)
+        hf_model.eval()
+        print("\n-> Huggingface model and tokenizer loaded")
+
+        # === Load the KerasNLP model ===
+        backbone_kwargs = dict(
+            vocabulary_size=hf_model.config.vocab_size,
+            hidden_dim=hf_model.config.hidden_size,
+            num_layers=hf_model.config.num_hidden_layers,
+            num_query_heads=hf_model.config.num_attention_heads,
+            num_key_value_heads=hf_model.config.num_key_value_heads,
+            intermediate_dim=hf_model.config.intermediate_size,
+            sliding_window=hf_model.config.sliding_window,
+            layer_norm_epsilon=hf_model.config.rms_norm_eps,
+            rope_max_wavelength=hf_model.config.rope_theta,
+            dtype="float32",
+        )
+        keras_nlp_model = MistralBackbone(**backbone_kwargs)
+
+        # === Download the tokenizer from Huggingface model card ===
+        spm_path = (
+            f"https://huggingface.co/{hf_preset}/resolve/main/tokenizer.model"
+        )
+        response = requests.get(spm_path)
+        if not response.ok:
+            raise ValueError(f"Couldn't fetch {preset}'s tokenizer.")
+        tokenizer_path = os.path.join(temp_dir, "vocabulary.spm")
+        with open(tokenizer_path, "wb") as tokenizer_file:
+            tokenizer_file.write(response.content)
+        keras_nlp_tokenizer = MistralTokenizer(tokenizer_path)
+        print("\n-> Keras 3 model and tokenizer loaded.")
+
+        # === Port the weights ===
+        convert_checkpoints(keras_nlp_model, hf_model)
+        print("\n-> Weight transfer done.")
+
+        # === Check that the models and tokenizers outputs match ===
+        test_tokenizer(keras_nlp_tokenizer, hf_tokenizer)
+        test_model(keras_nlp_model, keras_nlp_tokenizer, hf_model, hf_tokenizer)
+        print("\n-> Tests passed!")
+
+        # === Save the model weights in float32 format ===
+        keras_nlp_model.save_weights(os.path.join(temp_dir, "model.weights.h5"))
+        print("\n-> Saved the model weights in float32")
+
+        del keras_nlp_model, hf_model
+        gc.collect()
+
+        # === Save the weights again in float16 ===
+        backbone_kwargs["dtype"] = "float16"
+        keras_nlp_model = MistralBackbone(**backbone_kwargs)
+        keras_nlp_model.load_weights(os.path.join(temp_dir, "model.weights.h5"))
+        save_to_preset(keras_nlp_model, preset)
+        print("\n-> Saved the model preset in float16")
+
+        # === Save the tokenizer ===
+        save_to_preset(
+            keras_nlp_tokenizer, preset, config_filename="tokenizer.json"
+        )
+        print("\n-> Saved the tokenizer")
+    finally:
+        shutil.rmtree(temp_dir)
 
 
 if __name__ == "__main__":
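A note on this third patch: save_to_preset bundles the config JSON, the weights file, the tokenizer assets, and a metadata file into a single preset directory, replacing the JSON the previous revision of the script assembled by hand. For the backbone and tokenizer saved above, the local output directory should look roughly like this; the layout is inferred from the files the old script wrote, not printed by the tool itself:

    mistral_7b_en/
    ├── config.json           # MistralBackbone config, no pinned "dtype"
    ├── model.weights.h5      # float16 weights
    ├── metadata.json         # Keras/KerasNLP versions, parameter count
    ├── tokenizer.json        # MistralTokenizer config
    └── assets/
        └── tokenizer/
            └── vocabulary.spm

That directory is what gets uploaded as a new Kaggle model version, which is why the kaggle_handle entries in mistral_presets.py are bumped alongside it.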