diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..e69de29 diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..e69de29 diff --git a/.env_example b/.env_example new file mode 100644 index 0000000..4a28324 --- /dev/null +++ b/.env_example @@ -0,0 +1,24 @@ +S3_API_HOST= +S3_BUCKET_NAME= +S3_ACCESS_KEY= +S3_SECRET_KEY= + +DVC_USE_SSL=True +DVC_REMOTE_URL= +DVC_REMOTE_NAME=minio + + +MLFLOW_POSTGRES_DB=postgres_mlflow +MLFLOW_POSTGRES_USER=mlflow +MLFLOW_POSTGRES_PASSWORD=mlflow_password +MLFLOW_POSTGRES_PORT=5432 +MLFLOW_POSTGRES_HOST=mlflow-postgres + +BACKEND_STORE_URI=postgresql://${MLFLOW_POSTGRES_USER}:${MLFLOW_POSTGRES_PASSWORD}@localhost:${MLFLOW_POSTGRES_PORT}/${MLFLOW_POSTGRES_DB} +MLFLOW_TRACKING_URI=http://localhost:5000 +MLFLOW_S3_ENDPOINT_URL=https:// +MLFLOW_S3_REMOTE_URL= +MLFLOW_PORT=5000 + +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..82f9275 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..7475dd3 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,48 @@ +# default_stages: [commit, push] +# Список файлов, которые необходимо игнорировать при проверке pre-commit hooks +exclude: '^(models/|data/|notebooks/|config/|mlflow/|.dvc/)' + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-merge-conflict + - id: check-added-large-files + args: ['--maxkb=10000'] + - repo: local + hooks: + # FLAKE 8 + - id: flake8 + name: flake8 + entry: pflake8 --config pyproject.toml . 
+ language: system + types: [ python ] + # RUFF + - id: ruff + name: ruff + entry: poetry run ruff check + language: system + types: [ python ] + # MYPY + # - id: mypy + # name: mypy + # entry: poetry run mypy + # require_serial: true + # language: system + # # args: [--strict] + + # PYLINT + - id: pylint + name: pylint + entry: poetry run pylint + language: system + types: [python] + args: + [ + "-rn", # Only display messages + "-sn", # Don't display the score + "--rcfile=pyproject.toml" + ] \ No newline at end of file diff --git a/README.md b/README.md index a2b2253..f577dca 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,102 @@ # ML-DL-Repository-Template Шаблон репозитория для классического ML/DL проекта + +**Задача:** Тут можно кратко описать задачу... + +## Чтобы запустить проект локально: +1. Установите python 3.11 и выше +2. Склонировать репозиторий: ``git clone ...`` +3. Создать виртуальное окружение: ``python -m venv venv`` +4. Активироавть виртуальное окружение: ``venv\Scripts\activate`` (для windows) +5. Заполните ``.env`` файл: +6. Соберите проект: ``python setup.py`` + +``` +S3_API_HOST= +S3_BUCKET_NAME= +S3_ACCESS_KEY= +S3_SECRET_KEY= + +DVC_USE_SSL=True +DVC_REMOTE_URL=s3://ml-team-spb/businessguarantees +DVC_REMOTE_NAME=minio + + +MLFLOW_POSTGRES_DB=postgres_mlflow +MLFLOW_POSTGRES_USER=mlflow +MLFLOW_POSTGRES_PASSWORD=mlflow_password +MLFLOW_POSTGRES_PORT=5432 +MLFLOW_POSTGRES_HOST=mlflow-postgres + +# MLFLOW S3 +BACKEND_STORE_URI=postgresql://${MLFLOW_POSTGRES_USER}:${MLFLOW_POSTGRES_PASSWORD}@localhost:${MLFLOW_POSTGRES_PORT}/${MLFLOW_POSTGRES_DB} +MLFLOW_TRACKING_URI=http://localhost:5000 +MLFLOW_S3_ENDPOINT_URL=S3_API_HOST +MLFLOW_S3_REMOTE_URL= + +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= +MLFLOW_PORT=5000 +``` + +## Чтобы запустить mlflow: +1. Получить доступ к докеру и настроить его +2. Заполнить env файл +3. 
Запустить mlflow (желательно в отдельной консоли): ``python mlflow/mlflow_start.py`` + +## Принцип ведения git: + +Во время разработки следует следует создавать новые ветки из ``main``. + +1. Чтобы добавить новый функционал, создаем ветку ``название_нового_функционала`` от ``main``. + +(Хочу добавить поддержку линейной регрессии -> название ветки: ``add: linear_reg_support``) + +2. Чтобы пофиксить баг создаем ветку ``fix: описание_бага`` от ``main`` + +(Хочу добавить пофиксить баг подключение к БД -> название ветки: ``fix: bd_connection``) + +3. Чтобы что-то инициализировать ``init: описание`` + +## Полезные git команды: +1. ``git checkout -b branch_name`` - создать ветку и переключиться на нее +2. ``git branch`` - отобразить все существующие ветки +3. ``git push -u origin new branch`` - отправить новую ветку в удаленный репозиторий +4. ``git reset -soft HEAD~1`` - удалить последний 1 коммит, но сохранить изменения +5. ``git branch -d `` - удалить ветку из локального репозиторию +6. ``git checkout current_branch`` ``->`` ``git merge target_branch`` – подтянуть изменения из ``target_branch`` ветки в ``current_branch`` +7. ``git checkout --track -b local_branch_name origin/remote_branch_name`` – Чтобы склонировать конкретную существующую в удаленном репозитории ветку, нужно ввести команду +8. ``git rm --cached filename`` (если директория, то ``-r filename``) – удалить что-то из всевидящего GIT - ока: + +## Полезные команды Poetry: +1. ``poetry update`` - обновить зависимость по pyproject.toml файлу +2. ``poetry install`` - установить все зависимости по pyproject.toml и poetry.lock файлам +3. ``poetry add --group group_name lib_name`` - добавить библиотеку в зависимости (``--group group_name`` - не обязательно) +4. ``poetry remove lib_name`` - удалить библиотеку из зависимостей +5. ``poetry run pre-commit run --all-files`` - запустить pre-commit хуки + +## Полезные команды DVC: +1. ``dvc init`` - инициализировать все dvc файлы +2. 
``dvc add path/to/...`` - добавить в dvc папку/файл +3. ``dvc commit`` - Фиксирует изменения +4. ``dvc push `` - загрузить измененные данные в s3 +5. ``dvc pull`` - выгрузить данные из s3 +6. ``dvc remote list`` - показать список доступных хранилищ +7. ``dvc diff`` - показать разницу между предыдущими версиями +8. ``dvc checkout`` - восстановить данные из предыдущего коммита + +## Полезные команды Docker: +1. ``docker-compose down -v`` - (удалить контейнеры, включая volumes) +2. ``docker-compose down`` - (удалить контейнеры) +3. ``docker-compose build --no-cache`` - пересобрать, не используя кэш +4. ``docker-compose build`` - пересобрать +5. ``docker-compose up -d`` - запустить в detouch моде (логи не отображаются в консоли) +6. ``docker-compose up -d --build`` - запустить в detouch моде и пересобрать +7. ``docker ps`` - вывести список запущенных контейнеров + +## Полезные команды: +1. ``set PYTHONPATH=%PYTHONPATH%;C:\Users\\PycharmProjects\`` - перед запуском jupyter notebook. Чтобы корректно работали во вложенных папках, а не только в корне (для PyCharm) +2. 
Установить прокси совкомбанка: + - ``set http_proxy=http://proxy-server`` + - ``set https_proxy=http://proxy-server`` + - ``set no_proxylocalhost,127.0.0.1,192.168.*,10.60.*`` diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/config/config.py b/config/config.py new file mode 100644 index 0000000..e69de29 diff --git a/data/data.csv b/data/data.csv new file mode 100644 index 0000000..e69de29 diff --git a/data/img/img.jpeg b/data/img/img.jpeg new file mode 100644 index 0000000..e69de29 diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..7e6556f --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,24 @@ +version: '3' + +services: + mlflow-postgres: + image: postgres:13.3 + ports: + - ${MLFLOW_POSTGRES_PORT}:5432 + env_file: + - .env + environment: + - POSTGRES_DB=${MLFLOW_POSTGRES_DB} + - POSTGRES_USER=${MLFLOW_POSTGRES_USER} + - POSTGRES_PASSWORD=${MLFLOW_POSTGRES_PASSWORD} + - POSTGRES_HOST=${MLFLOW_POSTGRES_HOST} + networks: + - ml_network + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $${MLFLOW_POSTGRES_USER} -d $${MLFLOW_POSTGRES_DB}"] + interval: 60s + timeout: 10s + retries: 3 + restart: on-failure +networks: + ml_network: diff --git a/mlflow/Dockerfile b/mlflow/Dockerfile new file mode 100644 index 0000000..7733a6f --- /dev/null +++ b/mlflow/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.11 + +WORKDIR /mlflow/ + +COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt && \ + rm requirements.txt + +EXPOSE 5000 + +CMD mlflow server --host 0.0.0.0 --port 5000 --backend-store-uri ${MLFLOW_BACKEND_STORE_URI} --default-artifact-root ${MLFLOW_DEFAULT_ARTIFACT_ROOT} diff --git a/mlflow/mlflow_start.py b/mlflow/mlflow_start.py new file mode 100644 index 0000000..b252fba --- /dev/null +++ b/mlflow/mlflow_start.py @@ -0,0 +1,38 @@ +import os +import subprocess + +from dotenv import load_dotenv + +load_dotenv() + + +def run_command(command: str) -> None: + """Выполняет указанную команду""" + subprocess.run(command, shell=True, check=True) + + +def setup_mlflow_server() -> None: + """Запускает MLflow сервер с заданными переменными окружения""" + + try: + # Формирование команды для запуска MLflow сервера + mlflow_command = ( + f'mlflow server --host 0.0.0.0 --port {os.getenv("MLFLOW_PORT")} ' + f'--backend-store-uri {os.getenv("BACKEND_STORE_URI")} ' + f'--default-artifact-root {os.getenv("MLFLOW_S3_REMOTE_URL")} ' + f'--artifacts-destination {os.getenv("MLFLOW_S3_REMOTE_URL")} ' + ) + + # Запускаем БД + run_command("docker-compose up -d") + + # Запуск MLflow сервера + run_command(mlflow_command) + except KeyboardInterrupt: + pass + finally: + run_command("docker-compose stop") + + +if __name__ == "__main__": + setup_mlflow_server() diff --git a/mlflow/requirements.txt b/mlflow/requirements.txt new file mode 100644 index 0000000..43f8876 --- /dev/null +++ b/mlflow/requirements.txt @@ -0,0 +1,3 @@ +mlflow==2.13.2 +psycopg2 +boto3 \ No newline at end of file diff --git a/models/model.pkl b/models/model.pkl new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/notebook.ipynb b/notebooks/notebook.ipynb new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8a907d5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,142 @@ +[tool.poetry] +name = "Project" +version = "0.1.0" +description = "" +authors = [""] +readme = 
"README.md" +packages = [ + { include = "src" } +] + +# Основные зависимости +[tool.poetry.dependencies] +python = "^3.11" +pandas = "^2.2.2" +scikit-learn = "^1.4.2" +optuna = "^3.6.1" +joblib = "^1.4.2" +openpyxl = "^3.1.2" +matplotlib = "^3.9.0" +requests = "^2.31.0" +scipy = "^1.13.0" +tqdm = "^4.66.4" +statsmodels = "^0.14.2" +seaborn = "^0.13.2" +dvc = {extras = ["s3"], version = "^3.50.2"} +python-dotenv = "^1.0.1" +pydantic = "^2.7.1" +pydantic-settings = "^2.2.1" +pyarrow = "15.0.0" +fastparquet = "^2024.5.0" +shap = "^0.45.1" +catboost = "^1.2.5" + +[tool.poetry.group.dev.dependencies] # Зависимости для DEV версии +ruamel-yaml = "0.18.5" +pylint = "^3.2.0" +pre-commit = "^3.7.1" +flake8 = "^7.0.0" +ruff = "^0.4.4" +pre-commit-hooks = "^4.6.0" +pyproject-flake8 = "^7.0.0" +ipykernel = "^6.29.4" +ipywidgets = "^8.1.2" +jupyter = "^1.0.0" +mypy = "^1.10.0" +mlflow = "^2.13.2" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +# MYPY CONFIG +[tool.mypy] +ignore_missing_imports = true +exclude = [ + "^.git/", + "^__pycache__/", + "^venv/", + "^models/", + "^data/", + "^notebooks/", + "^config/", + "^mlflow/", + "^.dvc/" +] + +[[tool.mypy.overrides]] +module = "tqdm" +ignore_errors = true + +[[tool.mypy.overrides]] +module = "selenium" +ignore_errors = true + +[[tool.mypy.overrides]] +module = "catboost" +ignore_errors = true + +[[tool.mypy.overrides]] +module = "pymorphy2" +ignore_errors = true + +# RUFF CONFIG +[tool.ruff] # RUFF CONFIG: +line-length = 120 + +[tool.ruff.lint] +ignore = ["F841", "E722"] + +# PYLINT CONFIG +[tool.pylint."MESSAGES CONTROL"] # PYLINT CONFIG: +disable = [ + "C0114", # missing module docstring + "C0103", # Constant name doesn't conform to UPPER_CASE naming style + "W0612", # Unused import + "W0613", # Unused argument + "R0801", # simillar 2 lines + "R0903", # Too few public methods (1/2) +] + +[tool.pylint.FORMAT] +max-line-length = 120 + +[tool.pylint.BASIC] +max-args = 8 +max-attributes = 
import subprocess

# python-dotenv is optional here: bootstrap still works off the real environment.
try:
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:  # pragma: no cover
    pass


def run_command(command: str) -> None:
    """Run *command* in a shell, raising CalledProcessError on a non-zero exit."""
    subprocess.run(command, shell=True, check=True)


def setup() -> None:
    """Bootstrap the whole project: dependencies, DVC remote, data, git hooks."""
    # 1. Upgrade pip itself.
    run_command("python -m pip install --upgrade pip")

    # 2. Install the bootstrap dependencies (python-dotenv, poetry).
    run_command("pip install -r requirements.txt")

    # 3. Install the remaining dependencies with poetry.
    run_command("poetry install")

    # 4. Fill in the DVC remote config.
    #    Fixed path: this patch creates setup_dvc.py in the repository root,
    #    not under .dvc/ — the old ".dvc/setup_dvc.py" path could never run.
    run_command("python setup_dvc.py")

    # 5. Pull the current models and datasets from S3.
    run_command("dvc pull")

    # 6. Install the pre-commit hooks
    #    (the original comment wrongly repeated step 5's description).
    run_command("poetry run pre-commit install")


if __name__ == "__main__":
    setup()
Подгрузить актуальные модели и датасеты из s3 + run_command("poetry run pre-commit install") + + +if __name__ == "__main__": + setup() \ No newline at end of file diff --git a/setup_dvc.py b/setup_dvc.py new file mode 100644 index 0000000..7dba149 --- /dev/null +++ b/setup_dvc.py @@ -0,0 +1,24 @@ +import os + +from dotenv import load_dotenv + +# Загрузка переменных окружения из файла +env_path = ".env" +load_dotenv(dotenv_path=env_path) + +# Получение значений из переменных окружения +remote_name = os.getenv("DVC_REMOTE_NAME") +remote_url = os.getenv("DVC_REMOTE_URL") +endpoint_url = os.getenv("S3_API_HOST") +access_key_id = os.getenv("S3_ACCESS_KEY") +secret_access_key = os.getenv("S3_SECRET_KEY") +use_ssl = os.getenv("DVC_USE_SSL") + +# Настройка DVC удаленного хранилища +os.system(f"dvc remote add -d {remote_name} {remote_url}") +os.system(f"dvc remote modify {remote_name} endpointurl {'https://' + endpoint_url}") +os.system(f"dvc remote modify {remote_name} access_key_id {access_key_id}") +os.system(f"dvc remote modify {remote_name} secret_access_key {secret_access_key}") +os.system(f"dvc remote modify {remote_name} use_ssl {use_ssl}") + +print("DVC configuration has been set up.") diff --git a/src/interfaces/interface.py b/src/interfaces/interface.py new file mode 100644 index 0000000..e69de29 diff --git a/src/interfaces/processor_interface.py b/src/interfaces/processor_interface.py new file mode 100644 index 0000000..f8088a8 --- /dev/null +++ b/src/interfaces/processor_interface.py @@ -0,0 +1,48 @@ +# Интерфейс для процессоров +import pandas as pd +from joblib import Parallel, delayed + + +class ProcessorInterface: + """Метод, реализующие интерфейс для процессора (выделение фич, препроцессинг...)""" + + def __init__(self) -> None: + pass + + def feature_construct(self, df: pd.DataFrame, column_name: str = "trn_purp", is_cuda: bool = False) -> pd.DataFrame: + """Метод для выделения фич""" + return df + + def multi_feature_construct( + self, df: pd.DataFrame, 
column_name: str, chunk_size: int = 5000, n_cores: int = 8, is_cuda: bool = False + ) -> pd.DataFrame: + """Метод для параллельного выделения фич""" + dfs = [df[i : i + chunk_size] for i in range(0, len(df), chunk_size)] + results = Parallel(n_jobs=n_cores)(delayed(self.feature_construct)(X, column_name, is_cuda) for X in dfs) + + # Если получился словарь (для эмбеддингов): + if isinstance(results[0], dict): + data = {} + for d in results: + data.update(d) + # Если получился датафрейм (для остального): + elif isinstance(results[0], (pd.DataFrame, pd.Series)): + data = pd.concat(results) + return data + + def transform( + self, + df: pd.DataFrame, + column_name: str = "trn_purp", + chunk_size: int = 500, + n_jobs: int = 1, + n_cores: int = 8, + is_cuda: bool = False, + ) -> pd.DataFrame: + """Метод для преобразования входящего df с возможностью выбора синхронного или параллельного режима""" + if n_jobs == 1: + df = self.feature_construct(df, column_name, is_cuda=is_cuda) + elif n_jobs == -1: + df = self.multi_feature_construct(df, column_name, chunk_size=chunk_size, n_cores=n_cores, is_cuda=is_cuda) + + return df diff --git a/src/pipeline/pipeline.py b/src/pipeline/pipeline.py new file mode 100644 index 0000000..e69de29 diff --git a/src/processors/precessor.py b/src/processors/precessor.py new file mode 100644 index 0000000..e69de29 diff --git a/src/scripts/script.py b/src/scripts/script.py new file mode 100644 index 0000000..e69de29