
Commit bd6e627

Merge pull request #13 from winglian/dev
merge dev branch for various fixes
2 parents a56cf2d + e09cb40 commit bd6e627

9 files changed: +301 -110 lines changed

TODO.md

+10
@@ -0,0 +1,10 @@
+# todo list
+
+- [] Validation of parameters for combinations that won't work
+
+
+
+## things that are known not to work
+
+- FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203
+- adamw_bnb_8bit doesn't play well with FSDP offload
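The first item above calls for validating the config up front against combinations that are known to fail. A minimal sketch of what such a check could look like, assuming a dict-like cfg; the field names below are purely illustrative and are not defined anywhere in this commit:

# Hypothetical validation sketch; field names are assumptions, not axolotl identifiers.
def validate_config(cfg: dict):
    if cfg.get("fsdp_offload_params") and cfg.get("gradient_checkpointing"):
        # known-broken combination, see pytorch/pytorch#82203
        raise ValueError("FSDP offload and gradient_checkpointing do not work together")
    if cfg.get("fsdp_offload_params") and cfg.get("optimizer") == "adamw_bnb_8bit":
        raise ValueError("adamw_bnb_8bit doesn't play well with FSDP offload")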

ds_config.json

+24 -3

@@ -10,21 +10,42 @@
     "hysteresis": 2,
     "min_loss_scale": 1
   },
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": "auto",
+      "betas": "auto",
+      "eps": "auto",
+      "weight_decay": "auto"
+    }
+  },
   "scheduler": {
-    "type": "OneCycle",
+    "type": "WarmupDecayLR",
     "params": {
-      "cycle_min_lr": 1e-7,
-      "cycle_max_lr": 1e-4
+      "warmup_min_lr": "auto",
+      "warmup_max_lr": "auto",
+      "warmup_num_steps": "auto",
+      "total_num_steps": "auto"
     }
   },
   "zero_optimization": {
     "stage": 2,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
     "overlap_comm": true,
     "allgather_partitions": true,
     "allgather_bucket_size": 5e8,
     "contiguous_gradients": true,
     "reduce_bucket_size": "auto",
     "reduce_scatter": true,
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
     "stage3_gather_16bit_weights_on_model_save": true
   },
   "gradient_accumulation_steps": "auto",
scripts/finetune.py

+25 -45

@@ -1,5 +1,7 @@
+import importlib
 import logging
 import os
+import pathlib
 import random
 import signal
 import sys
@@ -11,6 +13,8 @@
 from attrdict import AttrDefault

 # add src to the pythonpath so we don't need to pip install this
+from axolotl.utils.tokenization import check_dataset_labels
+
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)
@@ -42,48 +46,20 @@ def get_device():
     cfg.device_map = {"": cfg.device}


-def check_dataset_labels(dataset, tokenizer):
-    from termcolor import colored
-
-    # the dataset is already shuffled, so let's just check the first 5 elements
-    for idx in range(5):
-        # Get the input_ids, labels, and attention_mask from the dataset
-        input_ids = dataset[idx]["input_ids"]
-        labels = dataset[idx]["labels"]
-        attention_mask = dataset[idx]["attention_mask"]
-
-        # You can compare the input_ids and labels element-wise
-        # Remember to ignore positions with IGNORE_TOKEN_ID (if you use it) or attention_mask equal to 0
-        colored_tokens = []
-        for i, (input_id, label_id, mask) in enumerate(
-            zip(input_ids, labels, attention_mask)
-        ):
-            decoded_input_token = tokenizer.decode(input_id)
-            # Choose the color based on whether the label has the ignore value or not
-            color = (
-                "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
-            )
-            colored_token = colored(decoded_input_token, color) + colored(
-                f"({label_id}, {mask})", "white"
-            )
-            colored_tokens.append(colored_token)
-
-        logging.info(" ".join(colored_tokens))
-        logging.info("\n\n\n")
-
-
-def do_inference(cfg, model, tokenizer):
+def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
     tokenizer.add_special_tokens({"unk_token": "<unk>"})
     tokenizer.add_special_tokens({"bos_token": "<s>"})
     tokenizer.add_special_tokens({"eos_token": "</s>"})

-    from axolotl.prompters import ReflectAlpacaPrompter
+    prompter_module = getattr(importlib.import_module("axolotl.prompters"), prompter)

     while True:
-        instruction = str(input("Give me an instruction: "))
+        # support for multiline inputs
+        print("Give me an instruction (Ctrl + D to finish): ")
+        instruction = pathlib.Path("/proc/self/fd/0").read_text()
         if not instruction:
             return
-        prompt = ReflectAlpacaPrompter().build_prompt(instruction=instruction)
+        prompt = prompter_module().build_prompt(instruction=instruction)
         batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

         model.eval()
@@ -174,8 +150,8 @@ def train(
         cfg.bf16 = False

     # Load the model and tokenizer
-    logging.info("loading model, tokenizer, and lora_config...")
-    model, tokenizer, lora_config = load_model(
+    logging.info("loading model, tokenizer, and peft_config...")
+    model, tokenizer, peft_config = load_model(
         cfg.base_model,
         cfg.base_model_config,
         cfg.model_type,
@@ -190,6 +166,10 @@ def train(
         do_inference(cfg, model, tokenizer)
         return

+    if "shard" in kwargs:
+        model.save_pretrained(cfg.output_dir)
+        return
+
     train_dataset, eval_dataset = load_prepare_datasets(
         tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
     )
@@ -199,8 +179,9 @@ def train(
         return

     if cfg.debug:
+        logging.info("check_dataset_labels...")
         check_dataset_labels(
-            train_dataset.select([random.randrange(0, len(train_dataset) - 1)]),
+            train_dataset.select([random.randrange(0, len(train_dataset) - 1) for i in range(5)]),
             tokenizer,
         )

@@ -213,9 +194,9 @@ def train(
         model = torch.compile(model)

     # go ahead and presave, so we have the adapter config available to inspect
-    if lora_config:
+    if peft_config:
         logging.info(f"Pre-saving adapter config to {cfg.output_dir}")
-        lora_config.save_pretrained(cfg.output_dir)
+        peft_config.save_pretrained(cfg.output_dir)

     # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
     if cfg.local_rank == 0:
@@ -234,12 +215,11 @@ def train(
         logging.info(f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}")
     trainer.train(resume_from_checkpoint=resume_from_checkpoint)

-    if cfg.local_rank == 0:
-        # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
-        logging.info(
-            f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}"
-        )
-        model.save_pretrained(cfg.output_dir)
+    logging.info(
+        f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}"
+    )
+    # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
+    trainer.save_model(cfg.output_dir)


 if __name__ == "__main__":
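Two changes above are worth a closer look: the prompter class is now resolved by name via importlib (so it can be chosen at the CLI), and inference reads a multiline instruction from stdin by reading /proc/self/fd/0 until Ctrl + D. A small standalone sketch of both patterns, assuming only that a class with the given name exists in axolotl.prompters; the helper names here are illustrative:

import importlib
import pathlib

def resolve_prompter(name: str = "AlpacaPrompter"):
    # look the prompter class up by name instead of importing it directly
    return getattr(importlib.import_module("axolotl.prompters"), name)

def read_multiline_instruction() -> str:
    # /proc/self/fd/0 is the process's stdin (Linux-specific); read_text() consumes it until EOF (Ctrl + D)
    print("Give me an instruction (Ctrl + D to finish): ")
    return pathlib.Path("/proc/self/fd/0").read_text()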

scripts/setup-runpod.sh

+9
@@ -26,6 +26,15 @@ if [ -z "${TORCH_CUDA_ARCH_LIST}" ]; then # only set this if not set yet
     export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 fi

+# install flash-attn and deepspeed from pre-built wheels for this specific container b/c these take forever to install
+mkdir -p /workspace/wheels
+cd /workspace/wheels
+curl -L -O https://github.com/winglian/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
+curl -L -O https://github.com/winglian/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
+pip install deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
+pip install flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
+pip install "peft @ git+https://github.com/huggingface/peft.git@main" --force-reinstall --no-dependencies
+
 cd /workspace/
 git clone https://github.com/winglian/axolotl.git
 cd axolotl
src/axolotl/prompters.py

+16 -5

@@ -127,7 +127,7 @@ def append_message(self, role, message):


 class ShareGPTPrompter:
-    def build_prompt(self, source, tokenizer):
+    def build_prompt(self, source, tokenizer, sequence_len=2048):
         # ignore the system prompt if provided
         if source[0]["from"] == "system":
             source.pop(0)
@@ -157,13 +157,14 @@ def build_prompt(self, source, tokenizer):
             role = roles[sentence["from"]]
             assert role == conv.roles[j % 2]
             conv.append_message(role, sentence["value"])
+        # TODO, this concatenates everything, but doesn't seem to properly add the eos_token_id, as the eos_token gets split up
         conversation = conv.get_prompt()

         # Tokenize conversations
         tokenized_result = tokenizer(
             conversation,
             truncation=True,
-            max_length=2048,  # FIXME
+            max_length=sequence_len,  # FIXME
             padding=False,
             return_tensors=None,
         )
@@ -173,7 +174,9 @@ def build_prompt(self, source, tokenizer):
         sep = conv.sep + conv.roles[1] + ": "

         rounds = conversation.split(conv.sep2)
+        rounds = [r + conv.sep2 for r in rounds]
         cur_len = 1
+        target[0] = IGNORE_TOKEN_ID  # mask out the bos
         for i, rou in enumerate(rounds):
             if rou == "":
                 break
@@ -182,19 +185,27 @@ def build_prompt(self, source, tokenizer):
             if len(parts) != 2:
                 break
             parts[0] += sep
-            round_len = len(tokenizer(rou)["input_ids"])
-            instruction_len = len(tokenizer(parts[0])["input_ids"]) - 2
+            round_len = len(tokenizer(rou)["input_ids"]) - 1  # -1 ignores the bos_token generated for this
+            # we have to strip the initial part, any dangling whitespace creates an additional ghost token
+            instruction_len = len(tokenizer(parts[0].strip())["input_ids"]) - 1  # -1 ignores the bos_token generated for this
             target[cur_len : cur_len + instruction_len] = [
                 IGNORE_TOKEN_ID
             ] * instruction_len

             cur_len += round_len
-        target[cur_len:] = [IGNORE_TOKEN_ID] * (len(target) - cur_len)
+            if cur_len >= sequence_len:
+                break
+
+        # Fix: Truncate the target to have the same length as input_ids
+        target = target[:len(tokenized_result["input_ids"])]
+        # target[cur_len:] = [IGNORE_TOKEN_ID] * (len(target) - cur_len)
+
         attention_mask = [
             1 if x != tokenizer.pad_token_id else 0
             for x in tokenized_result["input_ids"]
         ]

+        # TODO truncate len to sequence_len
         return dict(
             input_ids=tokenized_result["input_ids"],
             labels=target,
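The substance of this change is how the labels are masked per conversation round: instruction tokens are set to IGNORE_TOKEN_ID so only response tokens contribute to the loss, and the new -1 offsets account for the bos token the tokenizer prepends to each piece. A simplified, standalone restatement of that loop; the tokenizer and the pre-split rounds are stand-ins, and only the offsets and masking mirror the diff:

IGNORE_TOKEN_ID = -100  # positions carrying this label are excluded from the loss

def mask_instruction_tokens(tokenizer, rounds, target, sep, sequence_len=2048):
    # Simplified sketch of the masking loop above, not the library's API.
    cur_len = 1
    target[0] = IGNORE_TOKEN_ID  # mask out the bos
    for rou in rounds:
        if rou == "":
            break
        parts = rou.split(sep)
        if len(parts) != 2:
            break
        parts[0] += sep
        # -1 ignores the bos_token the tokenizer prepends; stripping parts[0]
        # avoids the ghost token that dangling whitespace would create
        round_len = len(tokenizer(rou)["input_ids"]) - 1
        instruction_len = len(tokenizer(parts[0].strip())["input_ids"]) - 1
        target[cur_len : cur_len + instruction_len] = [IGNORE_TOKEN_ID] * instruction_len
        cur_len += round_len
        if cur_len >= sequence_len:
            break
    return target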
