lint
avishniakov committed Apr 19, 2024
Commit 0c85e8b (parent 8856e5c)
Showing 8 changed files with 64 additions and 24 deletions.
18 changes: 13 additions & 5 deletions llm-lora-finetuning/pipelines/train.py
@@ -17,9 +17,9 @@


 from steps import evaluate_model, finetune, prepare_data, promote
+from utils.hashing import compute_md5
 from zenml import logging as zenml_logging
 from zenml import pipeline
-from utils.hashing import compute_md5

 zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
     10000  # workaround for https://github.com/zenml-io/zenml/issues/2252
@@ -28,7 +28,11 @@

 @pipeline
 def llm_peft_full_finetune(
-    system_prompt: str, base_model_id: str, use_fast: bool = True, load_in_8bit: bool = False, load_in_4bit: bool = False
+    system_prompt: str,
+    base_model_id: str,
+    use_fast: bool = True,
+    load_in_8bit: bool = False,
+    load_in_4bit: bool = False,
 ):
     """Pipeline for finetuning an LLM with peft.
@@ -41,10 +45,14 @@ def llm_peft_full_finetune(
     - promote: promote the model to the target stage, if evaluation was successful
     """
     if not load_in_8bit and not load_in_4bit:
-        raise ValueError("At least one of `load_in_8bit` and `load_in_4bit` must be True.")
+        raise ValueError(
+            "At least one of `load_in_8bit` and `load_in_4bit` must be True."
+        )
     if load_in_4bit and load_in_8bit:
-        raise ValueError("Only one of `load_in_8bit` and `load_in_4bit` can be True.")
+        raise ValueError(
+            "Only one of `load_in_8bit` and `load_in_4bit` can be True."
+        )

     datasets_dir = prepare_data(
         base_model_id=base_model_id,
         system_prompt=system_prompt,
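
Note on the reflowed validation above: behavior is unchanged, and callers must still enable exactly one of the two quantization modes. A minimal sketch of that contract from the caller's side; the import path follows the repository layout, the argument values are placeholders, and actually executing the pipeline still requires a configured ZenML stack:

# Illustrative sketch of the flag contract enforced above; all values are placeholders.
from pipelines.train import llm_peft_full_finetune

try:
    # Neither quantization mode enabled: rejected before any step runs.
    llm_peft_full_finetune(
        system_prompt="placeholder prompt",
        base_model_id="placeholder/base-model",
        load_in_8bit=False,
        load_in_4bit=False,
    )
except ValueError as err:
    print(err)  # At least one of `load_in_8bit` and `load_in_4bit` must be True.

# A valid call enables exactly one mode, e.g. load_in_4bit=True.
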
17 changes: 12 additions & 5 deletions llm-lora-finetuning/scripts/finetune.py
@@ -19,15 +19,16 @@
 from typing import List

 import click
-import torch
 import transformers
 from datasets import load_from_disk
 from zenml.logger import get_logger

 logger = get_logger(__name__)


-@click.command(help="Technical wrapper to pass into the `accelerate launch` command.")
+@click.command(
+    help="Technical wrapper to pass into the `accelerate launch` command."
+)
 @click.option(
     "--base-model-id",
     type=str,
@@ -284,15 +285,21 @@ def accelerated_finetune(
             max_steps=max_steps,
             learning_rate=lr,
             logging_steps=(
-                min(logging_steps, max_steps) if max_steps >= 0 else logging_steps
+                min(logging_steps, max_steps)
+                if max_steps >= 0
+                else logging_steps
             ),
             bf16=bf16,
             optim=optimizer,
             logging_dir="./logs",
             save_strategy="steps",
-            save_steps=min(save_steps, max_steps) if max_steps >= 0 else save_steps,
+            save_steps=min(save_steps, max_steps)
+            if max_steps >= 0
+            else save_steps,
             evaluation_strategy="steps",
-            eval_steps=min(eval_steps, max_steps) if max_steps >= 0 else eval_steps,
+            eval_steps=min(eval_steps, max_steps)
+            if max_steps >= 0
+            else eval_steps,
             do_eval=True,
             label_names=label_names,
         ),
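
All three reflowed arguments above share one pattern: when max_steps is non-negative (a hard step budget is set), the logging, save, and eval intervals are capped at that budget so they still trigger within the run; a negative max_steps (the transformers default of -1, i.e. no explicit step limit) leaves the configured interval untouched. A self-contained restatement of the expression; the helper name is introduced here for illustration and does not exist in the script:

# Restates the clamping expression used in the TrainingArguments above.
def clamp_interval(interval: int, max_steps: int) -> int:
    # Cap the interval at max_steps when a step budget is set; otherwise keep it.
    return min(interval, max_steps) if max_steps >= 0 else interval


assert clamp_interval(50, 20) == 20  # short budget: interval capped at max_steps
assert clamp_interval(50, -1) == 50  # no step budget: configured interval kept
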
2 changes: 1 addition & 1 deletion llm-lora-finetuning/steps/evaluate_model.py
@@ -21,14 +21,14 @@
 import evaluate
 import torch
 from datasets import load_from_disk
+from utils.cuda import cleanup_memory
 from utils.loaders import (
     load_base_model,
     load_pretrained_model,
 )
 from utils.tokenizer import load_tokenizer, tokenize_for_eval
 from zenml import log_model_metadata, save_artifact, step
 from zenml.logger import get_logger
-from utils.cuda import cleanup_memory

 logger = get_logger(__name__)

14 changes: 10 additions & 4 deletions llm-lora-finetuning/steps/finetune.py
@@ -21,13 +21,13 @@
 import torch
 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
+from utils.cuda import cleanup_memory
 from zenml import logging as zenml_logging
 from zenml import step
 from zenml.logger import get_logger
 from zenml.materializers import BuiltInMaterializer

 from scripts.finetune import accelerated_finetune
-from utils.cuda import cleanup_memory

 logger = get_logger(__name__)
 zenml_logging.STEP_LOGS_STORAGE_MAX_MESSAGES = (
@@ -108,7 +108,9 @@ def finetune(
     else:
         logger.info("Starting accelerate training job...")
         ft_model_dir = "model_dir"
-        command = f"accelerate launch --num_processes {torch.cuda.device_count()} "
+        command = (
+            f"accelerate launch --num_processes {torch.cuda.device_count()} "
+        )
         command += str(Path("scripts/finetune.py").absolute()) + " "
         command += f'--base-model-id "{base_model_id}" '
         command += f'--dataset-dir "{dataset_dir}" '
@@ -118,8 +120,12 @@ def finetune(
         command += f"--save-steps {save_steps} "
         command += f"--optimizer {optimizer} "
         command += f"--lr {lr} "
-        command += f"--per-device-train-batch-size {per_device_train_batch_size} "
-        command += f"--gradient-accumulation-steps {gradient_accumulation_steps} "
+        command += (
+            f"--per-device-train-batch-size {per_device_train_batch_size} "
+        )
+        command += (
+            f"--gradient-accumulation-steps {gradient_accumulation_steps} "
+        )
         command += f"--warmup-steps {warmup_steps} "
         if bf16:
             command += f"--bf16 "
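
The step above assembles its accelerate launch invocation as a plain string, one flag at a time, sized to the number of visible GPUs. The sketch below is not the repository's code: it condenses the same string-building pattern, and the use of subprocess.run plus every concrete value is an assumption made for illustration (how the original step executes the string lies outside this diff):

# Sketch only: build an `accelerate launch` command string the way the step above
# does, then run it. subprocess.run and all concrete values are illustrative.
import subprocess

import torch

base_model_id = "example/base-model"  # placeholder
dataset_dir = "/tmp/datasets"  # placeholder

command = f"accelerate launch --num_processes {torch.cuda.device_count()} "
command += "scripts/finetune.py "
command += f'--base-model-id "{base_model_id}" '
command += f'--dataset-dir "{dataset_dir}" '
# ...remaining training flags as in the step above...
subprocess.run(command, shell=True, check=True)
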
4 changes: 2 additions & 2 deletions llm-lora-finetuning/steps/prepare_datasets.py
@@ -20,18 +20,18 @@

 from materializers.directory_materializer import DirectoryMaterializer
 from typing_extensions import Annotated
+from utils.cuda import cleanup_memory
 from utils.tokenizer import generate_and_tokenize_prompt, load_tokenizer
 from zenml import log_model_metadata, step
 from zenml.materializers import BuiltInMaterializer
-from utils.cuda import cleanup_memory


 @step(output_materializers=[DirectoryMaterializer, BuiltInMaterializer])
 def prepare_data(
     base_model_id: str,
     system_prompt: str,
     dataset_name: str = "gem/viggo",
-    use_fast: bool = True
+    use_fast: bool = True,
 ) -> Annotated[Path, "datasets_dir"]:
     """Prepare the datasets for finetuning.
2 changes: 1 addition & 1 deletion llm-lora-finetuning/steps/promote.py
@@ -15,10 +15,10 @@
 # limitations under the License.
 #

+from utils.cuda import cleanup_memory
 from zenml import get_step_context, step
 from zenml.client import Client
 from zenml.logger import get_logger
-from utils.cuda import cleanup_memory

 logger = get_logger(__name__)

27 changes: 22 additions & 5 deletions llm-lora-finetuning/utils/hashing.py
@@ -1,15 +1,32 @@
-import sys
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#

 import hashlib

-BUF_SIZE = 65536
+BUF_SIZE = 65536


-def compute_md5(file_path:str)->str:
+def compute_md5(file_path: str) -> str:
     md5 = hashlib.md5()

-    with open(file_path, 'rb') as f:
+    with open(file_path, "rb") as f:
         while True:
             data = f.read(BUF_SIZE)
             if not data:
                 break
             md5.update(data)
-    return md5.hexdigest()
+    return md5.hexdigest()
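
compute_md5 above streams the file in BUF_SIZE chunks, so large model or dataset files are hashed without being read into memory at once. A short usage sketch; the file path is a placeholder:

# Usage sketch for the helper above; the path is a placeholder.
from utils.hashing import compute_md5

checksum = compute_md5("/tmp/train.csv")
print(checksum)  # 32-character hexadecimal MD5 digest
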
4 changes: 3 additions & 1 deletion llm-lora-finetuning/utils/tokenizer.py
@@ -133,4 +133,6 @@ def tokenize_for_eval(
     """
         for data_point in data_points["target"]
     ]
-    return tokenizer(eval_prompts, padding="longest", return_tensors="pt").to("cuda")
+    return tokenizer(eval_prompts, padding="longest", return_tensors="pt").to(
+        "cuda"
+    )
