import random
import warnings
+ from typing import Optional

import fire
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from peft import PeftModel, get_peft_model
from torch.optim.lr_scheduler import StepLR
+ from transformers import AutoModelForCausalLM, AutoTokenizer

- from QEfficient.finetune.configs.training import train_config as TRAIN_CONFIG
+ from QEfficient.finetune.configs.peft_config import LoraConfig
+ from QEfficient.finetune.configs.training import TrainConfig
from QEfficient.finetune.utils.config_utils import (
    generate_dataset_config,
    generate_peft_config,
    get_dataloader_kwargs,
+     load_config_file,
    update_config,
+     validate_config,
)
from QEfficient.finetune.utils.dataset_utils import (
    get_custom_data_collator,
    get_preprocessed_dataset,
)
from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
from QEfficient.utils._utils import login_and_download_hf_lm
+ # Try importing QAIC-specific module, proceed without it if unavailable
try:
    import torch_qaic  # noqa: F401
except ImportError as e:
-     print(f"Warning: {e}. Moving ahead without these qaic modules.")
+     print(f"Warning: {e}. Proceeding without QAIC modules.")

+ # Suppress all warnings for cleaner output
+ warnings.filterwarnings("ignore")

- from transformers import AutoModelForCausalLM, AutoTokenizer

- # Suppress all warnings
- warnings.filterwarnings("ignore")
+ def setup_distributed_training(config: TrainConfig) -> None:
+     """Initialize distributed training environment if enabled.

+     Args:
+         config (TrainConfig): Training configuration object.

- def main(**kwargs):
+     Notes:
+         - If distributed data parallel (DDP) is disabled, this function does nothing.
+         - Asserts that the device is not CPU and carries no index, for DDP compatibility.
+         - Initializes the process group using the specified distributed backend.
+
+     Raises:
+         AssertionError: If device is CPU or includes an index with DDP enabled.
    """
-     Helper function to finetune the model on QAic.
+     if not config.enable_ddp:
+         return
+
+     torch_device = torch.device(config.device)
+     assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
+     assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
+
+     dist.init_process_group(backend=config.dist_backend)
+     # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
+     getattr(torch, torch_device.type).set_device(dist.get_rank())

-     .. code-block:: bash

-         python -m QEfficient.cloud.finetune OPTIONS
+ def setup_seeds(seed: int) -> None:
+     """Set random seeds across libraries for reproducibility.

+     Args:
+         seed (int): Seed value to set for random number generators.
+
+     Notes:
+         - Sets seeds for PyTorch, Python's random module, and NumPy.
    """
-     # update the configuration for the training process
-     train_config = TRAIN_CONFIG()
-     update_config(train_config, **kwargs)
-     device = train_config.device
+     torch.manual_seed(seed)
+     random.seed(seed)
+     np.random.seed(seed)

-     # dist init
-     if train_config.enable_ddp:
-         # TODO: may have to init qccl backend, next try run with torchrun command
-         torch_device = torch.device(device)
-         assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
-         assert torch_device.index is None, (
-             f"DDP requires specification of device type only, however provided device index as well: {torch_device}"
-         )
-         dist.init_process_group(backend=train_config.dist_backend)
-         # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
-         getattr(torch, torch_device.type).set_device(dist.get_rank())
-
-     # Set the seeds for reproducibility
-     torch.manual_seed(train_config.seed)
-     random.seed(train_config.seed)
-     np.random.seed(train_config.seed)
-
-     # Load the pre-trained model and setup its configuration
-     # config = AutoConfig.from_pretrained(train_config.model_name)
-     pretrained_model_path = login_and_download_hf_lm(train_config.model_name)
+
+ def load_model_and_tokenizer(config: TrainConfig) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
+     """Load the pre-trained model and tokenizer from Hugging Face.
+
+     Args:
+         config (TrainConfig): Training configuration object containing model and tokenizer names.
+
+     Returns:
+         tuple: A tuple containing the loaded model (AutoModelForCausalLM) and tokenizer (AutoTokenizer).
+
+     Notes:
+         - Downloads the model if not already cached using login_and_download_hf_lm.
+         - Configures the model with FP16 precision and disables caching for training.
+         - Resizes model embeddings if tokenizer vocab size exceeds model embedding size.
+         - Sets pad_token_id to eos_token_id if not defined in the tokenizer.
+     """
+     pretrained_model_path = login_and_download_hf_lm(config.model_name)
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_path,
        use_cache=False,
        attn_implementation="sdpa",
        torch_dtype=torch.float16,
    )

-     # Load the tokenizer and add special tokens
    tokenizer = AutoTokenizer.from_pretrained(
-         train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name
+         config.model_name if config.tokenizer_name is None else config.tokenizer_name
    )
    if not tokenizer.pad_token_id:
        tokenizer.pad_token_id = tokenizer.eos_token_id

-     # If there is a mismatch between tokenizer vocab size and embedding matrix,
-     # throw a warning and then expand the embedding matrix
    if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-         print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
+         print("WARNING: Resizing embedding matrix to match tokenizer vocab size.")
        model.resize_token_embeddings(len(tokenizer))

-     print_model_size(model, train_config)
+     return model, tokenizer

-     # print the datatype of the model parameters
-     # print(get_parameter_dtypes(model))
-
-     if train_config.use_peft:
-         # Load the pre-trained peft model checkpoint and setup its configuration
-         if train_config.from_peft_checkpoint:
-             model = PeftModel.from_pretrained(model, train_config.from_peft_checkpoint, is_trainable=True)
-             peft_config = model.peft_config
-         # Generate the peft config and start fine-tuning from original model
-         else:
-             peft_config = generate_peft_config(train_config, kwargs)
-             model = get_peft_model(model, peft_config)
-         model.print_trainable_parameters()
-
-     # Get the dataset utils
-     dataset_config = generate_dataset_config(train_config, kwargs)
-     dataset_processer = tokenizer

-     # Load and preprocess the dataset for training and validation
-     dataset_train = get_preprocessed_dataset(
-         dataset_processer, dataset_config, split="train", context_length=train_config.context_length
-     )
+ def apply_peft(model: AutoModelForCausalLM, train_config: TrainConfig, lora_config: LoraConfig) -> PeftModel:
+     """Apply Parameter-Efficient Fine-Tuning (PEFT) to the model if enabled."""
+     if not train_config.use_peft:
+         return model

-     dataset_val = get_preprocessed_dataset(
-         dataset_processer, dataset_config, split="test", context_length=train_config.context_length
-     )
+     if train_config.from_peft_checkpoint:
+         return PeftModel.from_pretrained(model, train_config.from_peft_checkpoint, is_trainable=True)
+
+     # Generate PEFT-compatible config from custom LoraConfig
+     peft_config = generate_peft_config(train_config, lora_config)
+     model = get_peft_model(model, peft_config)
+     model.print_trainable_parameters()
+     return model
+
+
+ def setup_dataloaders(
+     train_config: TrainConfig, dataset_config, tokenizer: AutoTokenizer, dataset_train, dataset_val
+ ) -> tuple[torch.utils.data.DataLoader, Optional[torch.utils.data.DataLoader]]:
+     """Set up training and validation DataLoaders.
+
+     Args:
+         train_config (TrainConfig): Training configuration object.
+         dataset_config: Configuration for the dataset (generated from train_config).
+         tokenizer (AutoTokenizer): Tokenizer for preprocessing data.
+         dataset_train: Preprocessed training dataset.
+         dataset_val: Preprocessed validation dataset.

-     # TODO: vbaddi, check if its necessary to do this?
-     # dataset_train = ConcatDataset(
-     #     dataset_train, chunk_size=train_config.context_length
-     # )
-     ##
-     train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train")
-     print("length of dataset_train", len(dataset_train))
-     custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config)
+     Returns:
+         tuple: A tuple of (train_dataloader, eval_dataloader), where eval_dataloader is None if validation is disabled.
+
+     Raises:
+         ValueError: If validation is enabled but the validation set is too small.
+
+     Notes:
+         - Applies a custom data collator if provided by get_custom_data_collator.
+         - Configures DataLoader kwargs using get_dataloader_kwargs for train and val splits.
+     """
+     custom_data_collator = get_custom_data_collator(tokenizer, dataset_config)
+     train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, tokenizer, "train")
    if custom_data_collator:
-         print("custom_data_collator is used")
        train_dl_kwargs["collate_fn"] = custom_data_collator

-     # Create DataLoaders for the training and validation dataset
    train_dataloader = torch.utils.data.DataLoader(
        dataset_train,
        num_workers=train_config.num_workers_dataloader,
@@ -150,12 +175,7 @@ def main(**kwargs):

    eval_dataloader = None
    if train_config.run_validation:
-         # if train_config.batching_strategy == "packing":
-         #     dataset_val = ConcatDataset(
-         #         dataset_val, chunk_size=train_config.context_length
-         #     )
-
-         val_dl_kwargs = get_dataloader_kwargs(train_config, dataset_val, dataset_processer, "val")
+         val_dl_kwargs = get_dataloader_kwargs(train_config, dataset_val, tokenizer, "val")
        if custom_data_collator:
            val_dl_kwargs["collate_fn"] = custom_data_collator

@@ -165,37 +185,90 @@ def main(**kwargs):
            pin_memory=True,
            **val_dl_kwargs,
        )
+         print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
        if len(eval_dataloader) == 0:
-             raise ValueError(
-                 f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
-             )
-         else:
-             print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
-
-         longest_seq_length, _ = get_longest_seq_length(
-             torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset])
-         )
-     else:
-         longest_seq_length, _ = get_longest_seq_length(train_dataloader.dataset)
+             raise ValueError("Eval set too small to load even one batch.")
+
+     return train_dataloader, eval_dataloader
+

+ def main(
+     model_name: str = None,
+     tokenizer_name: str = None,
+     batch_size_training: int = None,
+     lr: float = None,
+     peft_config_file: str = None,
+     **kwargs,
+ ) -> None:
+     """
+     Fine-tune a model on QAIC hardware with configurable training and LoRA parameters.
+
+     Args:
+         model_name (str, optional): Override default model name.
+         tokenizer_name (str, optional): Override default tokenizer name.
+         batch_size_training (int, optional): Override default training batch size.
+         lr (float, optional): Override default learning rate.
+         peft_config_file (str, optional): Path to YAML/JSON file containing PEFT (LoRA) config.
+         **kwargs: Additional arguments to override TrainConfig.
+
+     Example:
+         .. code-block:: bash
+
+             # Using a YAML config file for PEFT
+             python -m QEfficient.cloud.finetune \\
+                 --model_name "meta-llama/Llama-3.2-1B" \\
+                 --lr 5e-4 \\
+                 --peft_config_file "lora_config.yaml"
+
+             # Using default LoRA config
+             python -m QEfficient.cloud.finetune \\
+                 --model_name "meta-llama/Llama-3.2-1B" \\
+                 --lr 5e-4
+     """
+     train_config = TrainConfig()
+     # local_args = {k: v for k, v in locals().items() if v is not None and k != "peft_config_file" and k != "kwargs"}
+     update_config(train_config, **kwargs)
+
+     lora_config = LoraConfig()
+     if peft_config_file:
+         peft_config_data = load_config_file(peft_config_file)
+         validate_config(peft_config_data, config_type="lora")
+         lora_config = LoraConfig(**peft_config_data)
+
+     setup_distributed_training(train_config)
+     setup_seeds(train_config.seed)
+     model, tokenizer = load_model_and_tokenizer(train_config)
+     print_model_size(model, train_config)
+     model = apply_peft(model, train_config, lora_config)
+
+     # Generate dataset config from train_config and any remaining kwargs overrides
+     dataset_config = generate_dataset_config(train_config, kwargs)
+     dataset_train = get_preprocessed_dataset(
+         tokenizer, dataset_config, split="train", context_length=train_config.context_length
+     )
+     dataset_val = get_preprocessed_dataset(
+         tokenizer, dataset_config, split="test", context_length=train_config.context_length
+     )
+     train_dataloader, eval_dataloader = setup_dataloaders(
+         train_config, dataset_config, tokenizer, dataset_train, dataset_val
+     )
+     dataset_for_seq_length = (
+         torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset])
+         if train_config.run_validation
+         else train_dataloader.dataset
+     )
+     longest_seq_length, _ = get_longest_seq_length(dataset_for_seq_length)
    print(
-         f"The longest sequence length in the train data is {longest_seq_length}, "
-         f"passed context length is {train_config.context_length} and overall model's context length is "
-         f"{model.config.max_position_embeddings}"
+         f"Longest sequence length: {longest_seq_length}, "
+         f"Context length: {train_config.context_length}, "
+         f"Model max context: {model.config.max_position_embeddings}"
    )
    model.to(train_config.device)
-     optimizer = optim.AdamW(
-         model.parameters(),
-         lr=train_config.lr,
-         weight_decay=train_config.weight_decay,
-     )
+     optimizer = optim.AdamW(model.parameters(), lr=train_config.lr, weight_decay=train_config.weight_decay)
    scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma)
-
-     # wrap model with DDP
    if train_config.enable_ddp:
        model = nn.parallel.DistributedDataParallel(model, device_ids=[dist.get_rank()])
-
-     _ = train(
+     train(
        model,
        train_dataloader,
        eval_dataloader,
@@ -208,8 +281,6 @@ def main(**kwargs):
        dist.get_rank() if train_config.enable_ddp else None,
        None,
    )
-
-     # finalize torch distributed
    if train_config.enable_ddp:
        dist.destroy_process_group()
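
For reference, a minimal usage sketch of the refactored entry point (not part of the diff): it assumes `QEfficient` is installed, that the module's `fire.Fire(main)` entry point (outside this hunk) is unchanged, and that `lora_config.yaml` is a hypothetical file whose keys match `QEfficient.finetune.configs.peft_config.LoraConfig`. The keyword arguments simply mirror the `main()` signature and `TrainConfig` overrides introduced above.

```python
# Sketch only: drives the refactored main() programmatically instead of via the CLI.
# "lora_config.yaml" is a hypothetical LoRA config file; omit peft_config_file to use
# the default LoraConfig, exactly as in the docstring example above.
from QEfficient.cloud.finetune import main

main(
    model_name="meta-llama/Llama-3.2-1B",  # overrides TrainConfig.model_name
    lr=5e-4,                               # overrides TrainConfig.lr
    peft_config_file="lora_config.yaml",   # optional custom LoRA settings
)
```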