From 669f1d052c996a6b6c12bd146e15be30edb9be9d Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Sat, 7 Oct 2023 01:33:43 +0900
Subject: [PATCH] Fix: Higher vram usage for mistral and sample_packing (#691)

* Fix: Higher vram usage for mistral and sample_packing

* chore: update comment

* chore: lint
---
 examples/mistral/qlora.yml  | 8 ++++----
 src/axolotl/utils/models.py | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml
index 5a131c5f3..9c64a8c2d 100644
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -36,10 +36,10 @@ lora_target_modules:
   - k_proj
   - o_proj
 
-wandb_project: 
-wandb_entity: 
+wandb_project:
+wandb_entity:
 wandb_watch:
-wandb_run_id: 
+wandb_run_id:
 wandb_log_model:
 
 gradient_accumulation_steps: 4
@@ -76,4 +76,4 @@ fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
-  unk_token: "<unk>"
\ No newline at end of file
+  unk_token: "<unk>"
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index aa6049bd3..2c60f00c2 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -81,7 +81,8 @@ def load_tokenizer(cfg):
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-    if cfg.is_mistral_derived_model:
+    # Mistral's official FA implementation requires left padding
+    if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
         tokenizer.padding_side = "left"
 
     if cfg.special_tokens:
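
The Python sketch below restates the gating logic this patch adds to load_tokenizer, for readers skimming the diff. It is an illustrative stand-alone snippet, not axolotl code: mistral_padding_side and the SimpleNamespace config are assumed names standing in for axolotl's config object.

from types import SimpleNamespace


def mistral_padding_side(cfg):
    """Mirror the patched condition: return "left" only when a Mistral-derived
    model uses flash attention without sample packing; otherwise return None,
    meaning the tokenizer's padding_side is left untouched, as the patch does."""
    if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
        return "left"
    return None


if __name__ == "__main__":
    # With sample_packing enabled, padding is no longer forced to "left",
    # which is the behavior change addressing the higher VRAM usage in #691.
    cfg = SimpleNamespace(
        is_mistral_derived_model=True, flash_attention=True, sample_packing=True
    )
    print(mistral_padding_side(cfg))  # prints: None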