Merge pull request #98 from NanoCode012/feat/pre-commit
Add pre-commit: black+flake8+pylint+mypy+isort+bandit
winglian authored May 30, 2023
2 parents 9d06e51 + b58f0b1 commit 1f9089d
Showing 41 changed files with 885 additions and 382 deletions.
3 changes: 3 additions & 0 deletions .bandit
@@ -0,0 +1,3 @@
[bandit]
exclude = tests
skips = B101
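
A note on what this buys: B101 is bandit's `assert_used` check (noisy in code that leans on `assert`), and `exclude = tests` keeps the test suite out of the scan. A rough local equivalent of the CI hook, assuming it is run from the repository root:

```bash
# Install bandit and scan the tree with the config above.
# --ini points bandit at the .bandit file; -r recurses into directories.
pip3 install bandit
bandit --ini .bandit -r .
```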
5 changes: 5 additions & 0 deletions .flake8
@@ -0,0 +1,5 @@
[flake8]
max-line-length = 88

select = C,E,F,W,B,B950
extend-ignore = E203, E501, W503
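
These settings track Black's recommended flake8 setup: `max-line-length = 88` matches Black's default, and E203/W503 are ignored because they conflict with Black's formatting of slices and line breaks. The B and B950 codes come from the flake8-bugbear plugin's suggested configuration, which assumes that plugin is installed. A quick manual run, with target paths as an assumption:

```bash
# flake8 picks up the .flake8 file automatically when run from the repo root
pip3 install flake8
flake8 scripts/ src/
```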
16 changes: 16 additions & 0 deletions .github/workflows/pre-commit.yml
@@ -0,0 +1,16 @@
name: pre-commit

on:
  pull_request:
  push:

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: "3.9"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.0
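
`pre-commit/action@v3.0.0` effectively runs `pre-commit run --all-files` on every push and pull request, so CI enforces the same hooks contributors run locally. A local dry run, sketched on the assumption that the hook config below is in place:

```bash
pip3 install pre-commit
pre-commit run --all-files   # run every configured hook against the whole tree
```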
2 changes: 1 addition & 1 deletion .gitignore
@@ -160,4 +160,4 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.idea/
2 changes: 2 additions & 0 deletions .isort.cfg
@@ -0,0 +1,2 @@
[settings]
profile=black
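
`profile=black` switches isort to Black-compatible settings (trailing commas, 88-character lines, parenthesized multi-line imports) so the two formatters never fight over import layout. To check imports without rewriting anything (paths are illustrative):

```bash
# --check-only reports unsorted imports; --diff shows what would change
pip3 install isort
isort --check-only --diff scripts/ src/
```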
33 changes: 33 additions & 0 deletions .mypy.ini
@@ -0,0 +1,33 @@
[mypy]

exclude = venv

[mypy-alpaca_lora_4bit.*]
ignore_missing_imports = True

[mypy-flash_attn.*]
ignore_missing_imports = True

[mypy-huggingface_hub]
ignore_missing_imports = True

[mypy-transformers.*]
ignore_missing_imports = True

[mypy-peft]
ignore_missing_imports = True

[mypy-bitsandbytes]
ignore_missing_imports = True

[mypy-datasets]
ignore_missing_imports = True

[mypy-fire]
ignore_missing_imports = True

[mypy-setuptools]
ignore_missing_imports = True

[mypy-addict]
ignore_missing_imports = True
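
Each `ignore_missing_imports = True` section silences mypy's missing-stub errors for one third-party package; the `.*` variants such as `transformers.*` also cover that package's submodules. A manual run might look like the following sketch; mypy finds `.mypy.ini` on its own, and the target paths are an assumption:

```bash
# types-PyYAML mirrors the additional_dependencies in the mypy pre-commit hook
pip3 install mypy types-PyYAML
mypy scripts/ src/
```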
42 changes: 42 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,42 @@
default_language_version:
  python: python3.9

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
  - repo: https://github.com/PyCQA/flake8
    rev: 6.0.0
    hooks:
      - id: flake8
  - repo: https://github.com/PyCQA/pylint
    rev: v2.17.4
    hooks:
      - id: pylint
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.3.0
    hooks:
      - id: mypy
        additional_dependencies:
          [
            'types-PyYAML',
          ]
  - repo: https://github.com/PyCQA/bandit
    rev: 1.7.5
    hooks:
      - id: bandit
        args: [
          '--ini',
          '.bandit',
        ]
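
For contributors, the intended workflow with this file is roughly the sketch below; the `autoupdate` step is optional and only relevant when bumping the pinned `rev` values:

```bash
pip3 install pre-commit
pre-commit install          # install the git hook so checks run on every commit
pre-commit run --all-files  # one-off run over the entire repo, matching CI
pre-commit autoupdate       # optional: update each hook's pinned rev
```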
14 changes: 14 additions & 0 deletions .pylintrc
@@ -0,0 +1,14 @@
[MASTER]
init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.dirname(find_pylintrc()))"

[TYPECHECK]

# List of members which are set dynamically and missed by Pylint inference
# system, and so shouldn't trigger E1101 when accessed.
generated-members=numpy.*, torch.*


[pylint.messages_control]
disable=missing-function-docstring, line-too-long, import-error,
too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
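
The `init-hook` appends the directory containing `.pylintrc` to `sys.path` so pylint can resolve first-party imports, and `generated-members=numpy.*, torch.*` suppresses false `E1101` (no-member) errors on attributes pylint cannot infer from those libraries. A manual invocation, with target paths assumed:

```bash
# --recursive=y lets pylint discover modules under plain directories
pip3 install pylint
pylint --recursive=y scripts/ src/
```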
11 changes: 11 additions & 0 deletions README.md
@@ -9,6 +9,8 @@
<p>
Go ahead and axolotl questions!!
</p>
<img src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/pre-commit.yml/badge.svg?branch=main" alt="pre-commit">
<img alt="PyTest Status" src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/tests.yml/badge.svg?branch=main">
</div>
</div>

@@ -406,3 +408,12 @@ Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
Bugs? Please check for an open issue, else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).

PRs are **greatly welcome**!

Please run the following to set up your environment:
```bash
pip3 install -r requirements-dev.txt -r requirements-tests.txt
pre-commit install

# test
pytest tests/
```
1 change: 0 additions & 1 deletion docker/Dockerfile-base
@@ -99,4 +99,3 @@ RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main" \
pip3 install awscli && \
# The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic

1 change: 0 additions & 1 deletion examples/falcon/config-7b-lora.yml
@@ -61,4 +61,3 @@ special_tokens:
pad_token: "<|endoftext|>"
bos_token: ">>ABSTRACT<<"
eos_token: "<|endoftext|>"

1 change: 0 additions & 1 deletion examples/falcon/config-7b.yml
@@ -61,4 +61,3 @@ special_tokens:
pad_token: "<|endoftext|>"
bos_token: ">>ABSTRACT<<"
eos_token: "<|endoftext|>"

3 changes: 3 additions & 0 deletions requirements-dev.txt
@@ -0,0 +1,3 @@
pre-commit
black
mypy
1 change: 0 additions & 1 deletion requirements.txt
@@ -4,7 +4,6 @@ bitsandbytes>=0.39.0
addict
fire
PyYAML==6.0
black
datasets
accelerate>=0.19.0
sentencepiece
24 changes: 19 additions & 5 deletions scripts/alpaca_json_to_jsonl.py
@@ -1,24 +1,38 @@
"""Module to convert json file to jsonl"""

import os
import sys
from pathlib import Path
from typing import Optional, Union

import fire
from typing import Optional

from axolotl.convert import (
FileReader,
FileWriter,
JsonlSerializer,
JsonParser,
JsonToJsonlConverter,
StdoutWriter,
)

# add src to the pythonpath so we don't need to pip install this
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

from axolotl.convert import *


def main(
input: Path,
file: Path,
output: Optional[Path] = None,
to_stdout: Optional[bool] = False,
):
"""
Convert a json file to jsonl
"""

file_reader = FileReader()
writer: Union[StdoutWriter, FileWriter]
if to_stdout or output is None:
writer = StdoutWriter()
else:
@@ -28,7 +42,7 @@ def main(

converter = JsonToJsonlConverter(file_reader, writer, json_parser, jsonl_serializer)

converter.convert(input, output)
converter.convert(file, output)


if __name__ == "__main__":
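
With the wildcard import replaced by explicit names and the builtin-shadowing `input` parameter renamed to `file`, the converter remains a `fire` CLI, so usage should look roughly like this (file paths are illustrative only):

```bash
# Convert Alpaca-style JSON to JSONL; omit --output (or pass --to_stdout)
# to write the result to stdout instead of a file.
python3 scripts/alpaca_json_to_jsonl.py --file data/alpaca.json --output data/alpaca.jsonl
```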
64 changes: 36 additions & 28 deletions scripts/finetune.py
@@ -1,44 +1,49 @@
"""Prepare and train a model on a dataset. Can also infer from a model or merge lora"""

import importlib
import logging
import os
import random
import signal
import sys
from pathlib import Path
from typing import Optional, List, Dict, Any, Union
from typing import Any, Dict, List, Optional, Union

import fire
import torch
import yaml

from axolotl.utils.data import load_prepare_datasets
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_model, load_tokenizer

# add src to the pythonpath so we don't need to pip install this
from axolotl.utils.tokenization import check_dataset_labels
from axolotl.utils.trainer import setup_trainer
from axolotl.utils.validation import validate_config
from axolotl.utils.dict import DictDefault
from axolotl.utils.wandb import setup_wandb_env_vars

project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

from axolotl.utils.data import load_prepare_datasets
from axolotl.utils.models import load_model, load_tokenizer
from axolotl.utils.trainer import setup_trainer
from axolotl.utils.wandb import setup_wandb_env_vars

logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"


def choose_device(cfg):
def get_device():
if torch.cuda.is_available():
return f"cuda:{cfg.local_rank}"
else:
try:
if torch.backends.mps.is_available():
return "mps"
except:
return "cpu"
try:
if torch.cuda.is_available():
return f"cuda:{cfg.local_rank}"

if torch.backends.mps.is_available():
return "mps"

raise SystemError("No CUDA/mps device found")
except Exception: # pylint: disable=broad-exception-caught
return "cpu"

cfg.device = get_device()
if cfg.device == "cuda":
@@ -51,7 +56,7 @@ def get_multi_line_input() -> Optional[str]:
print("Give me an instruction (Ctrl + D to finish): ")
instruction = ""
for line in sys.stdin:
instruction += line
instruction += line # pylint: disable=consider-using-join
# instruction = pathlib.Path("/proc/self/fd/0").read_text()
return instruction

@@ -92,7 +97,7 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):


def choose_config(path: Path):
yaml_files = [file for file in path.glob("*.yml")]
yaml_files = list(path.glob("*.yml"))

if not yaml_files:
raise ValueError(
@@ -130,12 +135,12 @@ def train(
config = choose_config(config)

# load the config from the yaml file
with open(config, "r") as f:
cfg: DictDefault = DictDefault(yaml.load(f, Loader=yaml.Loader))
with open(config, encoding="utf-8") as file:
cfg: DictDefault = DictDefault(yaml.safe_load(file))
# if there are any options passed in the cli, if it is something that seems valid from the yaml,
# then overwrite the value
cfg_keys = cfg.keys()
for k in kwargs:
for k, _ in kwargs.items():
# if not strict, allow writing to cfg even if it's not in the yml already
if k in cfg_keys or cfg.strict is False:
# handle booleans
@@ -167,13 +172,11 @@

# load the tokenizer first
logging.info("loading tokenizer...")
tokenizer = load_tokenizer(
cfg.base_model_config,
cfg.tokenizer_type,
cfg
)
tokenizer = load_tokenizer(cfg.base_model_config, cfg.tokenizer_type, cfg)

if check_not_in(["inference", "shard", "merge_lora"], kwargs): # don't need to load dataset for these
if check_not_in(
["inference", "shard", "merge_lora"], kwargs
): # don't need to load dataset for these
train_dataset, eval_dataset = load_prepare_datasets(
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
)
@@ -182,7 +185,7 @@
logging.info("check_dataset_labels...")
check_dataset_labels(
train_dataset.select(
[random.randrange(0, len(train_dataset) - 1) for i in range(5)]
[random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec
),
tokenizer,
)
@@ -239,7 +242,10 @@ def train(
if cfg.local_rank == 0:
signal.signal(
signal.SIGINT,
lambda signal, frame: (model.save_pretrained(cfg.output_dir), exit(0)),
lambda signal, frame: (
model.save_pretrained(cfg.output_dir),
sys.exit(0),
),
)

logging.info("Starting trainer...")
@@ -252,7 +258,8 @@
]
if len(possible_checkpoints) > 0:
sorted_paths = sorted(
possible_checkpoints, key=lambda path: int(path.split("-")[-1])
possible_checkpoints,
key=lambda path: int(path.split("-")[-1]),
)
resume_from_checkpoint = sorted_paths[-1]
logging.info(
@@ -266,6 +273,7 @@
# only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
if cfg.local_rank == 0:
model.save_pretrained(cfg.output_dir)

# trainer.save_model(cfg.output_dir) # TODO this may be needed for deepspeed to work? need to review another time


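
The entry point's behavior is unchanged by the cleanup: `train` takes a config path plus free-form keyword flags, and the diff shows `inference`, `shard`, and `merge_lora` short-circuiting dataset loading. A hedged usage sketch with a made-up config path:

```bash
# Train from a YAML config; extra --key=value flags override matching config keys
python3 scripts/finetune.py path/to/config.yml

# Load the model and prompt it interactively instead of training
python3 scripts/finetune.py path/to/config.yml --inference
```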
6 changes: 4 additions & 2 deletions setup.py
@@ -1,7 +1,9 @@
from setuptools import setup, find_packages
"""setup.py for axolotl"""

from setuptools import find_packages, setup

install_requires = []
with open("./requirements.txt", "r") as requirements_file:
with open("./requirements.txt", encoding="utf-8") as requirements_file:
# don't include peft yet until we check the int4
# need to manually install peft for now...
reqs = [r.strip() for r in requirements_file.readlines() if "peft" not in r]
(The remaining changed files in this commit are not shown here.)