Merge pull request axolotl-ai-cloud#276 from theobjectivedad/logging_enhancement

Logging update: added PID and formatting
winglian committed Jul 16, 2023
2 parents e0eeb2e + 9ae7958 commit 0caa19f
Showing 14 changed files with 124 additions and 79 deletions.
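
With the new formatter, every record carries a timestamp, level, logger location, and the emitting process ID, which makes interleaved multi-process output much easier to attribute. Under the configuration added in src/axolotl/logging_config.py, a call such as LOG.info("Starting trainer...") would render roughly as follows (the timestamp, line number, and PID shown are illustrative):

[2023-07-16 12:00:00,000] [INFO] [axolotl.scripts.train:314] [PID:12345] Starting trainer...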
3 changes: 3 additions & 0 deletions scripts/alpaca_json_to_jsonl.py
@@ -15,6 +15,9 @@
JsonToJsonlConverter,
StdoutWriter,
)
+from axolotl.logging_config import configure_logging
+
+configure_logging()

# add src to the pythonpath so we don't need to pip install this
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
31 changes: 17 additions & 14 deletions scripts/finetune.py
@@ -17,6 +17,7 @@
from optimum.bettertransformer import BetterTransformer
from transformers import GenerationConfig, TextStreamer

+from axolotl.logging_config import configure_logging
from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_model, load_tokenizer
@@ -29,8 +30,10 @@
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

+configure_logging()
+LOG = logging.getLogger("axolotl.scripts")


-logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"


@@ -212,7 +215,7 @@ def train(

# load the tokenizer first
tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
logging.info(f"loading tokenizer... {tokenizer_config}")
LOG.info(f"loading tokenizer... {tokenizer_config}")
tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)

if (
@@ -234,7 +237,7 @@
eval_dataset = None

if cfg.debug or "debug" in kwargs:
logging.info("check_dataset_labels...")
LOG.info("check_dataset_labels...")
check_dataset_labels(
train_dataset.select(
[random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec
@@ -243,11 +246,11 @@
)

if prepare_ds_only:
logging.info("Finished preparing dataset. Exiting...")
LOG.info("Finished preparing dataset. Exiting...")
return

# Load the model and tokenizer
logging.info("loading model and peft_config...")
LOG.info("loading model and peft_config...")
model, peft_config = load_model(
cfg.base_model,
cfg.base_model_config,
@@ -258,17 +261,17 @@
)

if "merge_lora" in kwargs and cfg.adapter is not None:
logging.info("running merge of LoRA with base model")
LOG.info("running merge of LoRA with base model")
model = model.merge_and_unload()
model.to(dtype=torch.float16)

if cfg.local_rank == 0:
logging.info("saving merged model")
LOG.info("saving merged model")
model.save_pretrained(str(Path(cfg.output_dir) / "merged"))
return

if cfg.inference:
logging.info("calling do_inference function")
LOG.info("calling do_inference function")
prompter: Optional[str] = "AlpacaPrompter"
if "prompter" in kwargs:
if kwargs["prompter"] == "None":
@@ -287,12 +290,12 @@
model.config.use_cache = False

if torch.__version__ >= "2" and sys.platform != "win32":
logging.info("Compiling torch model")
LOG.info("Compiling torch model")
model = torch.compile(model)

# go ahead and presave, so we have the adapter config available to inspect
if peft_config:
logging.info(f"Pre-saving adapter config to {cfg.output_dir}")
LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
peft_config.save_pretrained(cfg.output_dir)

# In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
@@ -308,9 +311,9 @@ def terminate_handler(_, __, model):
signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
)

logging.info("Starting trainer...")
LOG.info("Starting trainer...")
if cfg.group_by_length:
logging.info("hang tight... sorting dataset for group_by_length")
LOG.info("hang tight... sorting dataset for group_by_length")
resume_from_checkpoint = cfg.resume_from_checkpoint
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
possible_checkpoints = [
@@ -322,7 +325,7 @@ def terminate_handler(_, __, model):
key=lambda path: int(path.split("-")[-1]),
)
resume_from_checkpoint = sorted_paths[-1]
-logging.info(
+LOG.info(
f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
)

@@ -336,7 +339,7 @@ def terminate_handler(_, __, model):
else:
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")

# TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
# only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
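
Note: the one-line logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO")) setup above is dropped in favor of the shared configure_logging(). Verbosity is still controlled by the same LOG_LEVEL environment variable, which the new dictConfig reads when building the root logger, so something like LOG_LEVEL=DEBUG python scripts/finetune.py config.yml (the config path here is illustrative) should enable debug output across every logger in the axolotl namespace.
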
4 changes: 3 additions & 1 deletion src/axolotl/datasets.py
@@ -14,6 +14,8 @@
# let's check to ensure we don't truncate an item in the middle, we'll use
# the collators later on to pad the datasets

+LOG = logging.getLogger("axolotl")


class TokenizedPromptDataset(IterableDataset):
"""
@@ -115,7 +117,7 @@ def __iter__(self):
"attention_mask": attention_mask,
}
else:
-logging.warning(
+LOG.warning(
f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
)
buffer = {
30 changes: 30 additions & 0 deletions src/axolotl/logging_config.py
@@ -0,0 +1,30 @@
"""Logging configuration settings"""

import os
import sys
from logging.config import dictConfig
from typing import Any, Dict

DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
"version": 1,
"formatters": {
"simple": {
"format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
},
},
"filters": {},
"handlers": {
"console": {
"class": "logging.StreamHandler",
"formatter": "simple",
"filters": [],
"stream": sys.stdout,
},
},
"root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")},
}


def configure_logging():
"""Configure with default logging"""
dictConfig(DEFAULT_LOGGING_CONFIG)
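
The pattern the rest of the commit applies is: configure once at process startup, then log through named loggers under the shared "axolotl" namespace. A minimal sketch of that usage follows; the logger name, message, and echoed output line are illustrative:

import logging

from axolotl.logging_config import configure_logging

configure_logging()  # installs DEFAULT_LOGGING_CONFIG via dictConfig; call before creating loggers

LOG = logging.getLogger("axolotl.example")  # hypothetical child of the "axolotl" namespace
LOG.info("dataset prepared")
# -> [2023-07-16 12:00:00,000] [INFO] [axolotl.example.<module>:8] [PID:12345] dataset prepared

Ordering matters here: because DEFAULT_LOGGING_CONFIG leaves dictConfig's disable_existing_loggers at its default of True, a non-root logger created before configure_logging() runs would be silenced, which is presumably why scripts/finetune.py calls configure_logging() before its LOG = logging.getLogger(...) line.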
4 changes: 2 additions & 2 deletions src/axolotl/monkeypatch/llama_landmark_attn.py
@@ -53,7 +53,7 @@
replace_return_docstrings,
)

-logger = logging.get_logger(__name__)
+LOG = logging.getLogger("axolotl")

_CONFIG_FOR_DOC = "LlamaConfig"

@@ -862,7 +862,7 @@ def forward(

if self.gradient_checkpointing and self.training:
if use_cache:
-logger.warning_once(
+LOG.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
4 changes: 3 additions & 1 deletion src/axolotl/prompt_strategies/pygmalion.py
@@ -11,6 +11,8 @@
tokenize_prompt_default,
)

+LOG = logging.getLogger("axolotl")

IGNORE_TOKEN_ID = -100


@@ -64,7 +66,7 @@ def tokenize_prompt(self, prompt):
*copy.deepcopy(res["input_ids"])
][len(self.bot_prefix_token_ids) :]
else:
logging.warning(f"unknown role in conversation: {role}")
LOG.warning(f"unknown role in conversation: {role}")
res = defaultdict(lambda: [])

# pylint: disable=duplicate-code
4 changes: 3 additions & 1 deletion src/axolotl/prompt_tokenizers.py
@@ -10,6 +10,8 @@

from axolotl.prompters import IGNORE_TOKEN_ID

+LOG = logging.getLogger("axolotl")

IGNORE_INDEX = -100
LLAMA_DEFAULT_PAD_TOKEN = "[PAD]" # nosec
LLAMA_DEFAULT_EOS_TOKEN = "</s>" # nosec
@@ -384,7 +386,7 @@ def tokenize_prompt(self, prompt):
# everything from this is masked out from the labels
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
else:
logging.warning(f"unhandled role: {part[0]}")
LOG.warning(f"unhandled role: {part[0]}")

# pylint: disable=duplicate-code
result, current_len = parse_tokenized_to_result(
3 changes: 2 additions & 1 deletion src/axolotl/prompters.py
@@ -5,6 +5,7 @@
from enum import Enum, auto
from typing import Generator, List, Optional, Tuple, Union

+LOG = logging.getLogger("axolotl")
IGNORE_TOKEN_ID = -100


@@ -241,7 +242,7 @@ def get_prompt(self) -> Generator[Tuple[str, str], None, None]:
if message:
yield (role + ":", " " + message)
else:
logging.warning(f"role with empty message: {role}")
LOG.warning(f"role with empty message: {role}")
yield (role + ":", "")

def copy(self):
44 changes: 20 additions & 24 deletions src/axolotl/utils/data.py
@@ -35,6 +35,8 @@
SummarizeTLDRPrompter,
)

+LOG = logging.getLogger("axolotl")


def load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path
@@ -73,17 +75,17 @@ def load_tokenized_prepared_datasets(
if dataset:
...
elif any(prepared_ds_path.glob("*")):
logging.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
dataset = load_from_disk(str(prepared_ds_path))
logging.info("Prepared dataset loaded from disk...")
LOG.info("Prepared dataset loaded from disk...")
else:
logging.info(f"Unable to find prepared dataset in {prepared_ds_path}")
logging.info("Loading raw datasets...")
LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
LOG.info("Loading raw datasets...")

if cfg.seed:
seed = cfg.seed
else:
logging.info("No seed provided, using default seed of 42")
LOG.info("No seed provided, using default seed of 42")
seed = 42

datasets = []
@@ -255,25 +257,21 @@ def load_tokenized_prepared_datasets(
suffix = ""
if ":load_" in d.type:
suffix = f" Did you mean {d.type.replace(':load_', '.load_')}?"
-logging.error(
-f"unhandled prompt tokenization strategy: {d.type}. {suffix}"
-)
+LOG.error(f"unhandled prompt tokenization strategy: {d.type}. {suffix}")
raise ValueError(
f"unhandled prompt tokenization strategy: {d.type} {suffix}"
)
logging.info("tokenizing, merging, and shuffling master dataset")
LOG.info("tokenizing, merging, and shuffling master dataset")

samples: List[int] = []
for d in datasets:
samples = samples + list(d)
dataset = Dataset.from_list(samples).shuffle(seed=seed)
if cfg.local_rank == 0:
-logging.info(
-f"Saving merged prepared dataset to disk... {prepared_ds_path}"
-)
+LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
dataset.save_to_disk(prepared_ds_path)
if cfg.push_dataset_to_hub:
-logging.info(
+LOG.info(
f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset.push_to_hub(
@@ -324,7 +322,7 @@ def load_prepare_datasets(
use_auth_token = cfg.hf_use_auth_token
try:
if cfg.push_dataset_to_hub:
-logging.info(
+LOG.info(
f"Checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset = load_dataset(
@@ -338,13 +336,13 @@
if dataset:
...
elif any(prepared_ds_path.glob("*")):
-logging.info(
+LOG.info(
f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
)
dataset = load_from_disk(str(prepared_ds_path))
logging.info("Prepared packed dataset loaded from disk...")
LOG.info("Prepared packed dataset loaded from disk...")
if cfg.push_dataset_to_hub:
-logging.info(
+LOG.info(
f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset.push_to_hub(
@@ -363,9 +361,7 @@
[dataset],
seq_length=max_packed_sequence_len,
)
-logging.info(
-f"packing master dataset to len: {cfg.max_packed_sequence_len}"
-)
+LOG.info(f"packing master dataset to len: {cfg.max_packed_sequence_len}")
dataset = Dataset.from_list(list(constant_len_dataset))

# filter out bad data
@@ -381,12 +377,12 @@
)

if cfg.local_rank == 0:
-logging.info(
+LOG.info(
f"Saving packed prepared dataset to disk... {prepared_ds_path}"
)
dataset.save_to_disk(prepared_ds_path)
if cfg.push_dataset_to_hub:
-logging.info(
+LOG.info(
f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset.push_to_hub(
@@ -399,7 +395,7 @@
)

if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
-logging.info(
+LOG.info(
f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"
)
dataset = dataset.shard(
@@ -520,7 +516,7 @@ def encode_pretraining(tokenizer, max_tokens, examples):
"attention_mask": [seq.tolist() for seq in new_attention_mask],
}

-logging.debug(len(ret["input_ids"]))
+LOG.debug(len(ret["input_ids"]))
return ret

