diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml
index 5a131c5f3..9c64a8c2d 100644
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -36,10 +36,10 @@ lora_target_modules:
- k_proj
- o_proj
-wandb_project:
-wandb_entity:
+wandb_project:
+wandb_entity:
wandb_watch:
-wandb_run_id:
+wandb_run_id:
wandb_log_model:
gradient_accumulation_steps: 4
@@ -76,4 +76,4 @@ fsdp_config:
special_tokens:
bos_token: ""
eos_token: ""
- unk_token: ""
\ No newline at end of file
+ unk_token: ""
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index aa6049bd3..2c60f00c2 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -81,7 +81,8 @@ def load_tokenizer(cfg):
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
os.environ["TOKENIZERS_PARALLELISM"] = "false"
- if cfg.is_mistral_derived_model:
+ # Mistral's official FA implementation requires left padding
+ if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
tokenizer.padding_side = "left"
if cfg.special_tokens:
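
The behavioral change is confined to the `models.py` hunk: left padding is now applied only when the model is Mistral-derived, flash attention is enabled, and sample packing is off. Below is a minimal sketch of that gate, not the actual axolotl implementation; it reuses the `cfg` field names from the diff, while `SimpleNamespace` and the `choose_padding_side` helper are stand-ins introduced here for illustration.

```python
from types import SimpleNamespace


def choose_padding_side(cfg) -> str:
    """Sketch of the padding-side rule introduced by the patch above."""
    # Mistral's official FA implementation requires left padding, and the
    # patched condition only forces it when sample packing is not in use.
    if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
        return "left"
    # Otherwise the tokenizer's existing default (typically right padding) is kept.
    return "right"


# FA on, packing off -> left padding; with packing on, the default is kept.
fa_only = SimpleNamespace(is_mistral_derived_model=True, flash_attention=True, sample_packing=False)
packed = SimpleNamespace(is_mistral_derived_model=True, flash_attention=True, sample_packing=True)
assert choose_padding_side(fa_only) == "left"
assert choose_padding_side(packed) == "right"
```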