Add pre-commit: black+flake8+pylint+mypy+isort+bandit #98

Merged 59 commits on May 30, 2023 (showing changes from all commits).

Commits:
36596ad  Add pre-commit: black+flake8+pylint (NanoCode012, May 28, 2023)
a98deb3  Add config files (NanoCode012, May 28, 2023)
392dfd9  Lint and format (NanoCode012, May 28, 2023)
c3a4697  Update ignores (NanoCode012, May 28, 2023)
d57ba56  Ignore import and too many * pylint errors (NanoCode012, May 29, 2023)
cb7cd34  Fix data.py lint (NanoCode012, May 29, 2023)
903ea30  Fix lint (NanoCode012, May 29, 2023)
1c60c10  Lint flash_attn.py (NanoCode012, May 29, 2023)
4c0eddb  Refactor (NanoCode012, May 29, 2023)
cb4f0e9  Lint prompters.py (NanoCode012, May 29, 2023)
5062eca  Lint callbacks.py (NanoCode012, May 29, 2023)
54c3b5b  Ignore too-many-arguments (NanoCode012, May 29, 2023)
e8717d3  Remove disable (NanoCode012, May 29, 2023)
5658717  Remove disable too many arg (NanoCode012, May 29, 2023)
69722ae  Remove fixme disable (NanoCode012, May 29, 2023)
545cfeb  Refactor error code to use full error message (NanoCode012, May 29, 2023)
daf47cc  Refactor disable pylint (NanoCode012, May 29, 2023)
f4e5d86  Lint models.py (NanoCode012, May 29, 2023)
82971e1  Lint finetune.py (NanoCode012, May 29, 2023)
1a2bd7f  Ignore too-few-public-methods (NanoCode012, May 29, 2023)
ddb86ea  Lint trainer.py (NanoCode012, May 29, 2023)
8b617cc  Lint setup.py (NanoCode012, May 29, 2023)
de2406c  Lint convert.py (NanoCode012, May 29, 2023)
6abb7f6  Lint datasets (NanoCode012, May 29, 2023)
8cc0aad  Lint alpaca_chat (NanoCode012, May 29, 2023)
145b060  Lint alpaca_instruct (NanoCode012, May 29, 2023)
1645a4d  Lint creative_acr (NanoCode012, May 29, 2023)
7eb33a7  Lint test_prompters (NanoCode012, May 29, 2023)
01c8a33  Lint pygmalion (NanoCode012, May 29, 2023)
5d86137  Lint prompt_tokenizers (NanoCode012, May 29, 2023)
633ff21  Lint dict (NanoCode012, May 29, 2023)
dae14e5  Ignore too-many-instance-attributes (NanoCode012, May 29, 2023)
fe1f4c4  Lint schedulers (NanoCode012, May 29, 2023)
e6b57de  Lint tokenization (NanoCode012, May 29, 2023)
c2dbf2c  Lint validation (NanoCode012, May 29, 2023)
9c6750a  Lint wandb (NanoCode012, May 29, 2023)
0e95288  Lint test_dict (NanoCode012, May 29, 2023)
1f3c3f5  Lint validation (NanoCode012, May 29, 2023)
8e46c0f  Refactor duplicate code between Prompter and Pygmalion (NanoCode012, May 29, 2023)
1bf1f59  Move black to dev requirements (NanoCode012, May 29, 2023)
afb31e1  Add badge and update contribution section (NanoCode012, May 29, 2023)
b832a0a  Black formatting (NanoCode012, May 29, 2023)
be22551  Fix unsupported operand type(s) for | (NanoCode012, May 29, 2023)
db288e9  Set python version (NanoCode012, May 29, 2023)
0dd35c7  Ignore unsupported-binary-operation (NanoCode012, May 29, 2023)
741a3f2  Add mypy (NanoCode012, May 29, 2023)
f1232b3  Update mypy dependencies (NanoCode012, May 29, 2023)
e9650d3  Fix mypy typing (NanoCode012, May 29, 2023)
96e8378  Delete extract_lora.py (NanoCode012, May 29, 2023)
37293dc  Apply isort then black (NanoCode012, May 29, 2023)
c17dae6  Update src/axolotl/prompt_strategies/alpaca_instruct.py (NanoCode012, May 29, 2023)
b1cc54b  Update pip install to also setup tests (NanoCode012, May 29, 2023)
d011422  Add isort (NanoCode012, May 29, 2023)
83d2920  Add bandit (NanoCode012, May 29, 2023)
a1f9850  Fix security issue or ignore false positives (NanoCode012, May 29, 2023)
cfcc549  fix relative path for fixtures (winglian, May 30, 2023)
25eeeeb  Fix sharegpt prompt (NanoCode012, May 30, 2023)
594e72b  Fix incorrect rebase (NanoCode012, May 30, 2023)
b81c97f  Fix pre-commit for rebased files (NanoCode012, May 30, 2023)
.bandit: 3 additions, 0 deletions
@@ -0,0 +1,3 @@
[bandit]
exclude = tests
skips = B101
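For context: `exclude = tests` keeps test code out of scope, and `skips = B101` disables bandit's assert_used check. A one-off run outside pre-commit would look roughly like this (target path illustrative):

```bash
bandit --ini .bandit -r src/
```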
.flake8: 5 additions, 0 deletions
@@ -0,0 +1,5 @@
[flake8]
max-line-length = 88

select = C,E,F,W,B,B950
extend-ignore = E203, E501, W503
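Note: E203 (whitespace before `:`) and W503 (line break before binary operator) are the standard exclusions for black compatibility, and selecting B950 while ignoring E501 defers line-length enforcement to the more lenient bugbear check (B950 requires the flake8-bugbear plugin). A manual run would look like this (paths illustrative):

```bash
flake8 src/ tests/ scripts/
```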
.github/workflows/pre-commit.yml: 16 additions, 0 deletions
@@ -0,0 +1,16 @@
name: pre-commit

on:
  pull_request:
  push:

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v3
    - uses: actions/setup-python@v4
      with:
        python-version: "3.9"
        cache: 'pip' # caching pip dependencies
    - uses: pre-commit/action@v3.0.0
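Since pre-commit/action simply replays the configured hooks against the checkout, a CI failure should be reproducible locally with something like:

```bash
pip3 install pre-commit
pre-commit run --all-files --show-diff-on-failure
```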
.gitignore: 1 addition, 1 deletion
@@ -160,4 +160,4 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-.idea/
\ No newline at end of file
+.idea/
.isort.cfg: 2 additions, 0 deletions
@@ -0,0 +1,2 @@
[settings]
profile=black
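`profile=black` aligns isort's output (trailing commas, parenthesized multi-line imports, line length) with black so the two tools don't fight. The resulting stdlib / third-party / first-party grouping is visible throughout this PR, e.g.:

```python
import os
import sys

import torch
import yaml

from axolotl.utils.dict import DictDefault
```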
.mypy.ini: 33 additions, 0 deletions
@@ -0,0 +1,33 @@
[mypy]

exclude = venv

[mypy-alpaca_lora_4bit.*]
ignore_missing_imports = True

[mypy-flash_attn.*]
ignore_missing_imports = True

[mypy-huggingface_hub]
ignore_missing_imports = True

[mypy-transformers.*]
ignore_missing_imports = True

[mypy-peft]
ignore_missing_imports = True

[mypy-bitsandbytes]
ignore_missing_imports = True

[mypy-datasets]
ignore_missing_imports = True

[mypy-fire]
ignore_missing_imports = True

[mypy-setuptools]
ignore_missing_imports = True

[mypy-addict]
ignore_missing_imports = True
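Each `[mypy-<package>]` override silences missing-stub errors for one untyped third-party dependency. A newly added untyped dependency would get the same treatment; `einops` below is purely illustrative:

```ini
[mypy-einops.*]
ignore_missing_imports = True
```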
.pre-commit-config.yaml: 42 additions, 0 deletions
@@ -0,0 +1,42 @@
default_language_version:
  python: python3.9

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
  - repo: https://github.com/PyCQA/flake8
    rev: 6.0.0
    hooks:
      - id: flake8
  - repo: https://github.com/PyCQA/pylint
    rev: v2.17.4
    hooks:
      - id: pylint
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.3.0
    hooks:
      - id: mypy
        additional_dependencies:
          [
            'types-PyYAML',
          ]
  - repo: https://github.com/PyCQA/bandit
    rev: 1.7.5
    hooks:
      - id: bandit
        args: [
          '--ini',
          '.bandit',
        ]
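With this file in place, the expected developer workflow is the standard pre-commit one; hooks can also be invoked individually by id:

```bash
pre-commit install               # register the git hook once
pre-commit run --all-files       # run every hook over the whole repo
pre-commit run mypy --all-files  # run a single hook by id
```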
.pylintrc: 14 additions, 0 deletions
@@ -0,0 +1,14 @@
[MASTER]
init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.dirname(find_pylintrc()))"

[TYPECHECK]

# List of members which are set dynamically and missed by Pylint inference
# system, and so shouldn't trigger E1101 when accessed.
generated-members=numpy.*, torch.*


[pylint.messages_control]
disable=missing-function-docstring, line-too-long, import-error,
too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
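The generated-members entry exists because numpy and torch create many members dynamically at runtime, which pylint's static inference can miss; without it, code like the sketch below may trigger spurious E1101 (no-member) warnings:

```python
import numpy
import torch

arr = numpy.zeros(4)
tensor = torch.from_numpy(arr)  # dynamically generated member; pylint may
                                # report E1101 here without the override
```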
README.md: 11 additions, 0 deletions
@@ -9,6 +9,8 @@
<p>
Go ahead and axolotl questions!!
</p>
+<img src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/pre-commit.yml/badge.svg?branch=main" alt="pre-commit">
+<img alt="PyTest Status" src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/tests.yml/badge.svg?branch=main">
</div>
</div>

@@ -406,3 +408,12 @@ Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
Bugs? Please check for an open issue first, else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).

PRs are **greatly welcome**!

Please run the following to set up your environment:
```bash
pip3 install -r requirements-dev.txt -r requirements-tests.txt
pre-commit install

# test
pytest tests/
```
docker/Dockerfile-base: 0 additions, 1 deletion
@@ -99,4 +99,3 @@ RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main" \
pip3 install awscli && \
# The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic
-
examples/falcon/config-7b-lora.yml: 0 additions, 1 deletion
@@ -61,4 +61,3 @@ special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: ">>ABSTRACT<<"
  eos_token: "<|endoftext|>"
-
examples/falcon/config-7b.yml: 0 additions, 1 deletion
@@ -61,4 +61,3 @@ special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: ">>ABSTRACT<<"
  eos_token: "<|endoftext|>"
-
requirements-dev.txt: 3 additions, 0 deletions
@@ -0,0 +1,3 @@
pre-commit
black
mypy
requirements.txt: 0 additions, 1 deletion
@@ -4,7 +4,6 @@ bitsandbytes>=0.39.0
addict
fire
PyYAML==6.0
-black
datasets
accelerate>=0.19.0
sentencepiece
scripts/alpaca_json_to_jsonl.py: 19 additions, 5 deletions
@@ -1,24 +1,38 @@
+"""Module to convert json file to jsonl"""
+
-import os
-import sys
from pathlib import Path
+from typing import Optional, Union

import fire
-from typing import Optional

+from axolotl.convert import (
+    FileReader,
+    FileWriter,
+    JsonlSerializer,
+    JsonParser,
+    JsonToJsonlConverter,
+    StdoutWriter,
+)
-
-# add src to the pythonpath so we don't need to pip install this
-project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-src_dir = os.path.join(project_root, "src")
-sys.path.insert(0, src_dir)
-
-from axolotl.convert import *


def main(
-    input: Path,
+    file: Path,
    output: Optional[Path] = None,
    to_stdout: Optional[bool] = False,
):
+    """
+    Convert a json file to jsonl
+    """
+
    file_reader = FileReader()
+    writer: Union[StdoutWriter, FileWriter]
    if to_stdout or output is None:
        writer = StdoutWriter()
    else:
@@ -28,7 +42,7 @@ def main(

    converter = JsonToJsonlConverter(file_reader, writer, json_parser, jsonl_serializer)

-    converter.convert(input, output)
+    converter.convert(file, output)


if __name__ == "__main__":
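Given the rename of main()'s first parameter from `input` (which shadows a builtin) to `file`, a fire-driven invocation would look roughly like this (file names hypothetical):

```bash
python scripts/alpaca_json_to_jsonl.py --file data.json --output data.jsonl
# or write to stdout instead of a file
python scripts/alpaca_json_to_jsonl.py --file data.json --to_stdout
```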
scripts/finetune.py: 36 additions, 28 deletions
@@ -1,44 +1,49 @@
+"""Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
+
import importlib
import logging
import os
import random
import signal
import sys
from pathlib import Path
-from typing import Optional, List, Dict, Any, Union
+from typing import Any, Dict, List, Optional, Union

import fire
import torch
import yaml

+from axolotl.utils.data import load_prepare_datasets
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.models import load_model, load_tokenizer
+
# add src to the pythonpath so we don't need to pip install this
from axolotl.utils.tokenization import check_dataset_labels
from axolotl.utils.trainer import setup_trainer
from axolotl.utils.validation import validate_config
-from axolotl.utils.dict import DictDefault
from axolotl.utils.wandb import setup_wandb_env_vars

project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

-from axolotl.utils.data import load_prepare_datasets
-from axolotl.utils.models import load_model, load_tokenizer
-from axolotl.utils.trainer import setup_trainer
-from axolotl.utils.wandb import setup_wandb_env_vars
-
logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"


def choose_device(cfg):
    def get_device():
-        if torch.cuda.is_available():
-            return f"cuda:{cfg.local_rank}"
-        else:
-            try:
-                if torch.backends.mps.is_available():
-                    return "mps"
-            except:
-                return "cpu"
+        try:
+            if torch.cuda.is_available():
+                return f"cuda:{cfg.local_rank}"
+
+            if torch.backends.mps.is_available():
+                return "mps"
+
+            raise SystemError("No CUDA/mps device found")
+        except Exception:  # pylint: disable=broad-exception-caught
+            return "cpu"

    cfg.device = get_device()
    if cfg.device == "cuda":
Expand All @@ -51,7 +56,7 @@ def get_multi_line_input() -> Optional[str]:
    print("Give me an instruction (Ctrl + D to finish): ")
    instruction = ""
    for line in sys.stdin:
-        instruction += line
+        instruction += line  # pylint: disable=consider-using-join
    # instruction = pathlib.Path("/proc/self/fd/0").read_text()
    return instruction

@@ -92,7 +97,7 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter")


def choose_config(path: Path):
-    yaml_files = [file for file in path.glob("*.yml")]
+    yaml_files = list(path.glob("*.yml"))

    if not yaml_files:
        raise ValueError(
@@ -130,12 +135,12 @@ def train(
    config = choose_config(config)

    # load the config from the yaml file
-    with open(config, "r") as f:
-        cfg: DictDefault = DictDefault(yaml.load(f, Loader=yaml.Loader))
+    with open(config, encoding="utf-8") as file:
+        cfg: DictDefault = DictDefault(yaml.safe_load(file))
    # if there are any options passed in the cli, if it is something that seems valid from the yaml,
    # then overwrite the value
    cfg_keys = cfg.keys()
-    for k in kwargs:
+    for k, _ in kwargs.items():
        # if not strict, allow writing to cfg even if it's not in the yml already
        if k in cfg_keys or cfg.strict is False:
            # handle booleans
@@ -167,13 +172,12 @@

    # load the tokenizer first
    logging.info("loading tokenizer...")
-    tokenizer = load_tokenizer(
-        cfg.base_model_config,
-        cfg.tokenizer_type,
-        cfg
-    )
+    tokenizer = load_tokenizer(cfg.base_model_config, cfg.tokenizer_type, cfg)

-    if check_not_in(["inference", "shard", "merge_lora"], kwargs):  # don't need to load dataset for these
+    if check_not_in(
+        ["inference", "shard", "merge_lora"], kwargs
+    ):  # don't need to load dataset for these
        train_dataset, eval_dataset = load_prepare_datasets(
            tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
        )
@@ -182,7 +185,7 @@
        logging.info("check_dataset_labels...")
        check_dataset_labels(
            train_dataset.select(
-                [random.randrange(0, len(train_dataset) - 1) for i in range(5)]
+                [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
            ),
            tokenizer,
        )
@@ -239,7 +242,10 @@ def train(
    if cfg.local_rank == 0:
        signal.signal(
            signal.SIGINT,
-            lambda signal, frame: (model.save_pretrained(cfg.output_dir), exit(0)),
+            lambda signal, frame: (
+                model.save_pretrained(cfg.output_dir),
+                sys.exit(0),
+            ),
        )

    logging.info("Starting trainer...")
@@ -252,7 +258,8 @@
        ]
        if len(possible_checkpoints) > 0:
            sorted_paths = sorted(
-                possible_checkpoints, key=lambda path: int(path.split("-")[-1])
+                possible_checkpoints,
+                key=lambda path: int(path.split("-")[-1]),
            )
            resume_from_checkpoint = sorted_paths[-1]
            logging.info(
@@ -266,6 +273,7 @@
    # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
    if cfg.local_rank == 0:
        model.save_pretrained(cfg.output_dir)
+
    # trainer.save_model(cfg.output_dir) # TODO this may be needed for deepspeed to work? need to review another time


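The substantive security fix above is the switch from yaml.load to yaml.safe_load, which bandit flags as B506 (yaml_load) because the unsafe loader can construct arbitrary Python objects. A minimal sketch of the difference (file name illustrative):

```python
import yaml

with open("config.yml", encoding="utf-8") as stream:
    cfg = yaml.safe_load(stream)  # builds plain data only; passes bandit B506
    # cfg = yaml.load(stream, Loader=yaml.Loader)  # flagged: can instantiate
    #                                              # arbitrary objects
```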
setup.py: 4 additions, 2 deletions
@@ -1,7 +1,9 @@
-from setuptools import setup, find_packages
+"""setup.py for axolotl"""
+
+from setuptools import find_packages, setup

install_requires = []
-with open("./requirements.txt", "r") as requirements_file:
+with open("./requirements.txt", encoding="utf-8") as requirements_file:
    # don't include peft yet until we check the int4
    # need to manually install peft for now...
    reqs = [r.strip() for r in requirements_file.readlines() if "peft" not in r]