Feat: Add support for upstream FA2 #626

Merged: 6 commits merged on Sep 26, 2023
4 changes: 4 additions & 0 deletions README.md
@@ -408,6 +408,10 @@ tokenizer_legacy:
# this is reported to improve training speed on some models
resize_token_embeddings_to_32x:

+# used to identify if the model is falcon/llama based
+is_falcon_derived_model:
+is_llama_derived_model:
+
# whether you are training a 4-bit GPTQ quantized model
gptq: true
gptq_groupsize: 128 # group size
1 change: 1 addition & 0 deletions examples/falcon/config-7b-lora.yml
@@ -3,6 +3,7 @@ base_model_config: tiiuae/falcon-7b
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
+is_falcon_derived_model: true
load_in_8bit: true
load_in_4bit: false
gptq: false
1 change: 1 addition & 0 deletions examples/falcon/config-7b-qlora.yml
@@ -6,6 +6,7 @@ base_model_config: tiiuae/falcon-7b
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
+is_falcon_derived_model: true
load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
1 change: 1 addition & 0 deletions examples/falcon/config-7b.yml
@@ -3,6 +3,7 @@ base_model_config: tiiuae/falcon-7b
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
+is_falcon_derived_model: true
load_in_8bit: false
load_in_4bit: false
gptq: false
4 changes: 2 additions & 2 deletions requirements.txt
@@ -4,9 +4,9 @@ torch==2.0.1
auto-gptq
packaging
peft @ git+https://github.com/huggingface/peft.git
-transformers @ git+https://github.com/huggingface/transformers.git
+transformers @ git+https://github.com/huggingface/transformers.git@0ac3875011d32dc85e0e83970507e3afe8f0febb
bitsandbytes>=0.41.1
-accelerate @ git+https://github.com/huggingface/accelerate
+accelerate @ git+https://github.com/huggingface/accelerate@80da9cfb09bb3cc9f1b385cb55d6b90d025a5fd9
deepspeed
addict
evaluate
101 changes: 0 additions & 101 deletions src/axolotl/monkeypatch/falcon_attn_hijack_flash.py

This file was deleted.

16 changes: 16 additions & 0 deletions src/axolotl/utils/config.py
@@ -86,6 +86,22 @@ def normalize_config(cfg):
        or (cfg.model_type and "llama" in cfg.model_type.lower())
    )

+    # figure out if the model is falcon
+    cfg.is_falcon_derived_model = (
+        (
+            hasattr(model_config, "model_type")
+            and model_config.model_type
+            in [
+                "falcon",
+                "RefinedWebModel",
+                "RefinedWeb",
+            ]
+        )
+        or cfg.is_falcon_derived_model
+        or "falcon" in cfg.base_model
+        or (cfg.model_type and "rwforcausallm" in cfg.model_type.lower())
+    )
+
    log_gpu_memory_usage(LOG, "baseline", cfg.device)


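As with the existing llama check, the new block treats a model as falcon-derived when the HF config's `model_type` is a known falcon variant, when the user sets `is_falcon_derived_model` explicitly, or when the base model name or `model_type` string suggests it. A minimal standalone sketch of how that check resolves for a stock Falcon checkpoint (illustrative only, not axolotl's exact code path; assumes the `tiiuae/falcon-7b` config is reachable):

```python
# Illustrative sketch: how the falcon-derived check above resolves for falcon-7b.
# AutoConfig is standard transformers usage; `base_model` is just an example input.
from transformers import AutoConfig

base_model = "tiiuae/falcon-7b"
model_config = AutoConfig.from_pretrained(base_model, trust_remote_code=True)

is_falcon_derived_model = (
    getattr(model_config, "model_type", None)
    in ["falcon", "RefinedWebModel", "RefinedWeb"]
    or "falcon" in base_model
)
print(is_falcon_derived_model)  # True, so the FA2 handling in models.py below applies
```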
20 changes: 6 additions & 14 deletions src/axolotl/utils/models.py
@@ -114,25 +114,13 @@ def load_model(

    replace_btlm_attn_with_flash_attn(cfg.base_model)

-    if hasattr(model_config, "model_type") and model_config.model_type in [
-        "falcon",
-        "RefinedWebModel",
-        "RefinedWeb",
-    ]:
-        if cfg.flash_attention:
-            from axolotl.monkeypatch.falcon_attn_hijack_flash import (
-                replace_falcon_attn_with_flash_attn,
-            )
-
-            replace_falcon_attn_with_flash_attn()
-
-    if cfg.is_llama_derived_model and cfg.flash_attention:
+    if cfg.is_llama_derived_model and cfg.flash_attention and cfg.sample_packing:
        if cfg.device not in ["mps", "cpu"] and not inference:
            from axolotl.monkeypatch.llama_attn_hijack_flash import (
                replace_llama_attn_with_flash_attn,
            )

-            LOG.info("patching with flash attention")
+            LOG.info("patching with flash attention for sample packing")
            replace_llama_attn_with_flash_attn(packed=cfg.sample_packing)
    elif cfg.is_llama_derived_model and cfg.xformers_attention:
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
@@ -213,6 +201,10 @@ def load_model(
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
+    # sample packing uses custom FA2 patch
+    if cfg.flash_attention and not cfg.sample_packing:
+        if cfg.is_llama_derived_model or cfg.is_falcon_derived_model:
+            model_kwargs["use_flash_attention_2"] = True
    try:
        if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq:
            from transformers import LlamaForCausalLM
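Taken together, the two hunks above split flash attention into two paths: when `sample_packing` is enabled, llama models keep axolotl's custom monkeypatch (which understands packed sequences); otherwise, llama- and falcon-derived models use transformers' upstream FlashAttention-2 by passing `use_flash_attention_2=True` through `model_kwargs` into `from_pretrained`. A minimal sketch of what that flag amounts to (assumes the pinned transformers commit from requirements.txt and the `flash-attn` package are installed; the checkpoint name is illustrative):

```python
# Illustrative sketch, not axolotl's exact call: loading a model with
# transformers' built-in FlashAttention-2 enabled.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b",          # example checkpoint
    torch_dtype=torch.bfloat16,  # FA2 expects fp16/bf16 weights
    use_flash_attention_2=True,  # what model_kwargs["use_flash_attention_2"] = True requests
)
```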