Skip to content

Commit

Permalink
Merge branch 'main' into xiaohan/enable_extra_arg_test
Browse files Browse the repository at this point in the history
  • Loading branch information
milocress authored Nov 22, 2024
2 parents fda9c12 + 6ce9bb7 commit f7942f4
Show file tree
Hide file tree
Showing 17 changed files with 344 additions and 111 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ jobs:
strategy:
matrix:
include:
- name: "2.4.0_cu124"
base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
- name: "2.5.1_cu124"
base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04
dep_groups: "[all]"
- name: "2.4.0_cu124_aws"
base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
- name: "2.5.1_cu124_aws"
base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws
dep_groups: "[all]"
steps:

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ jobs:
strategy:
matrix:
include:
- name: "cpu-2.4.0"
- name: "cpu-2.5.1"
pip_deps: "[all-cpu]"
container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.5.1_cpu-python3.11-ubuntu22.04
markers: "not gpu"
pytest_command: "coverage run -m pytest"
steps:
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ jobs:
fail-fast: false
matrix:
include:
- name: "gpu-2.4.0-1"
container: mosaicml/llm-foundry:2.4.0_cu124-latest
- name: "gpu-2.5.1-1"
container: mosaicml/llm-foundry:2.5.1_cu124-latest
markers: "gpu"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
Expand Down Expand Up @@ -51,8 +51,8 @@ jobs:
fail-fast: false
matrix:
include:
- name: "gpu-2.4.0-2"
container: mosaicml/llm-foundry:2.4.0_cu124-latest
- name: "gpu-2.5.1-2"
container: mosaicml/llm-foundry:2.5.1_cu124-latest
markers: "gpu"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
Expand Down Expand Up @@ -80,8 +80,8 @@ jobs:
fail-fast: false
matrix:
include:
- name: "gpu-2.4.0-4"
container: mosaicml/llm-foundry:2.4.0_cu124-latest
- name: "gpu-2.5.1-4"
container: mosaicml/llm-foundry:2.5.1_cu124-latest
markers: "gpu"
pip_deps: "[all]"
pytest_command: "coverage run -m pytest"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ jobs:
${{ env.AWS_DOCKER_TAG }}
${{ env.AWS_LATEST_TAG }}
build-args: |
BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
BASE_IMAGE=mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws
BRANCH_NAME=${{ env.BRANCH_NAME }}
DEP_GROUPS=[all]
KEEP_FOUNDRY=true
Expand All @@ -108,7 +108,7 @@ jobs:
${{ env.DOCKER_TAG }}
${{ env.LATEST_TAG }}
build-args: |
BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
BASE_IMAGE=mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04
BRANCH_NAME=${{ env.BRANCH_NAME }}
DEP_GROUPS=[all]
KEEP_FOUNDRY=true
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,24 +113,24 @@ If you have success/failure using LLM Foundry on other systems, please let us kn

| Device | Torch Version | Cuda Version | Status |
| -------------- | ------------- | ------------ | ---------------------------- |
| A100-40GB/80GB | 2.4.0 | 12.4 | :white_check_mark: Supported |
| H100-80GB | 2.4.0 | 12.4 | :white_check_mark: Supported |
| A100-40GB/80GB | 2.5.1 | 12.4 | :white_check_mark: Supported |
| H100-80GB | 2.5.1 | 12.4 | :white_check_mark: Supported |

## MosaicML Docker Images
We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories.

The `mosaicml/pytorch` images are pinned to specific PyTorch and CUDA versions, and are stable and rarely updated.

The `mosaicml/llm-foundry` images are built with new tags upon every commit to the `main` branch.
You can select a specific commit hash such as `mosaicml/llm-foundry:2.4.0_cu124-36ab1ba` or take the latest one using `mosaicml/llm-foundry:2.4.0_cu124-latest`.
You can select a specific commit hash such as `mosaicml/llm-foundry:2.5.1_cu124-9867a7b` or take the latest one using `mosaicml/llm-foundry:2.5.1_cu124-latest`.

**Please Note:** The `mosaicml/llm-foundry` images do not come with the `llm-foundry` package preinstalled, just the dependencies. You will still need to `pip install llm-foundry` either from PyPI or from source.

| Docker Image | Torch Version | Cuda Version | LLM Foundry dependencies installed? |
| ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- |
| `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04` | 2.4.0 | 12.4 (Infiniband) | No |
| `mosaicml/llm-foundry:2.4.0_cu124-latest` | 2.4.0 | 12.4 (Infiniband) | Yes |
| `mosaicml/llm-foundry:2.4.0_cu124_aws-latest` | 2.4.0 | 12.4 (EFA) | Yes |
| `mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04` | 2.5.1 | 12.4 (Infiniband) | No |
| `mosaicml/llm-foundry:2.5.1_cu124-latest` | 2.5.1 | 12.4 (Infiniband) | Yes |
| `mosaicml/llm-foundry:2.5.1_cu124_aws-latest` | 2.5.1 | 12.4 (EFA) | Yes |


# Installation
Expand Down
11 changes: 4 additions & 7 deletions llmfoundry/callbacks/hf_checkpointer.py
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,10 @@ def tensor_hook(

if dist.get_global_rank() == 0:
if register_to_mlflow:
assert new_model_instance is not None
new_model_instance = self.transform_model_pre_registration(
new_model_instance,
)
if self.using_peft:

# Save and register peft model to mlflow, this code path uses our older two step logic
Expand All @@ -798,10 +802,6 @@ def tensor_hook(
temp_save_dir,
'register_save',
)
assert new_model_instance is not None
new_model_instance = self.transform_model_pre_registration(
new_model_instance,
)
new_model_instance.save_pretrained(
register_save_dir,
max_shard_size='1GB',
Expand Down Expand Up @@ -860,9 +860,6 @@ def _save_and_register_peft_model(
original_tokenizer: Optional[Any],
save_dir: str,
):
new_model_instance = self.transform_model_pre_registration(
new_model_instance,
)
components = {'model': new_model_instance}
if original_tokenizer is not None:
components['tokenizer'] = original_tokenizer
Expand Down
40 changes: 38 additions & 2 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import json
import logging
import os
import re
Expand All @@ -27,6 +28,7 @@
FaultyDataPrepCluster,
InsufficientPermissionsError,
MalformedUCTableError,
StoragePermissionError,
UCNotEnabledError,
)

Expand Down Expand Up @@ -681,7 +683,7 @@ def fetch_DT(

log.info(f'Directory {json_output_folder} created.')

# validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True
# validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True.
method, dbsql, sparkSession = validate_and_get_cluster_info(
cluster_id=cluster_id,
databricks_host=DATABRICKS_HOST,
Expand All @@ -704,6 +706,14 @@ def fetch_DT(
dbsql,
)
except (grpc.RpcError, spark_errors.SparkConnectGrpcException) as e:
if isinstance(
e,
spark_errors.SparkConnectGrpcException,
) and 'is not Shared or Single User Cluster' in str(e):
raise FaultyDataPrepCluster(
message=
f'The cluster you have provided: {cluster_id} does not have data governance enabled. Please use a cluster with a data security mode other than NONE. {e}',
) from e
if isinstance(
e,
spark_errors.SparkConnectGrpcException,
Expand Down Expand Up @@ -732,12 +742,38 @@ def fetch_DT(
if dbsql is not None:
dbsql.close()

# combine downloaded jsonl into one big jsonl for IFT
# Combine downloaded jsonl into one big jsonl for IFT.
iterative_combine_jsons(
json_output_folder,
os.path.join(json_output_folder, json_output_filename),
)

_validate_written_file(
json_output_folder,
json_output_filename,
delta_table_name,
)


def _validate_written_file(
json_output_folder: str,
json_output_filename: str,
delta_table_name: str,
):
# Validate downloaded dataset is actually downloaded.
with open(os.path.join(json_output_folder, json_output_filename)) as f:
is_empty = True
for line in f.readlines():
is_empty = False
try:
json.loads(line)
except Exception as e:
raise ValueError(f'Line is not valid json: {line}') from e
if is_empty:
raise StoragePermissionError(
f'Unable to download {delta_table_name}, check network permissions.',
)


def _check_imports():
try:
Expand Down
2 changes: 2 additions & 0 deletions llmfoundry/data/contrastive_pairs/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import numpy as np
import torch
from composer.core import DataSpec
from composer.utils import retry
from streaming import Stream, StreamingDataset
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizerBase
Expand Down Expand Up @@ -136,6 +137,7 @@ def _get_contrastive_samples(
'negative': negative_responses,
}

@retry(BlockingIOError, num_attempts=5, initial_backoff=1.0, max_jitter=0.5)
def __getitem__(self, idx: int) -> dict[str, list[int]]:
sample = StreamingDataset.__getitem__(self, idx)
text_samples = []
Expand Down
8 changes: 8 additions & 0 deletions llmfoundry/models/mpt/modeling_mpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@
from llmfoundry.models.layers.norm import LPLayerNorm # type: ignore
# isort: on

from llmfoundry.utils.warnings import VersionedDeprecationWarning

log = logging.getLogger(__name__)

CROSS_ENTROPY_IGNORE_INDEX = -100
Expand Down Expand Up @@ -1360,6 +1362,12 @@ def compute_loss_from_logits(
else:
loss = losses.sum() / (targets != loss_fn.ignore_index).sum()
if sample_weighing_factor is not None:
warnings.warn(
VersionedDeprecationWarning(
message='sample_weighing_factor has been deprecated!',
remove_version='0.17.0',
),
)
if sample_weighing_factor.shape[0] > 1:
raise ValueError(
'Sample weighing factor is not supported when batch["sample_weighing_factor"].shape[0] > 1.',
Expand Down
4 changes: 3 additions & 1 deletion llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,8 +409,10 @@ def __init__(self, output_folder: str) -> None:
class MisconfiguredHfDatasetError(UserError):
"""Error thrown when a HuggingFace dataset is misconfigured."""

def __init__(self, dataset_name: str, split: str) -> None:
def __init__(self, dataset_name: str, split: Optional[str] = None) -> None:
message = f'Your dataset (name={dataset_name}, split={split}) is misconfigured. ' + \
'Please check your dataset format and make sure you can load your dataset locally.' \
if split is not None else f'Your dataset (name={dataset_name}) is misconfigured. ' + \
'Please check your dataset format and make sure you can load your dataset locally.'
super().__init__(message, dataset_name=dataset_name, split=split)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# build requirements
[build-system]
requires = ["setuptools < 68.0.0"]
requires = ["setuptools < 76.0.0"]
build-backend = "setuptools.build_meta"

# iSort
Expand Down
4 changes: 2 additions & 2 deletions scripts/inference/convert_composer_mpt_to_ft.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def write_ft_checkpoint_from_composer_checkpoint(
# Extract the HF tokenizer
print('#' * 30)
print('Extracting HF Tokenizer...')
hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(
hf_tokenizer = get_hf_tokenizer_from_composer_state_dict( # pyright: ignore
composer_state_dict,
trust_remote_code,
)
Expand All @@ -141,7 +141,7 @@ def write_ft_checkpoint_from_composer_checkpoint(

# Extract the model weights
weights_state_dict = composer_state_dict['state']['model']
torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
torch.nn.modules.utils.consume_prefix_in_state_dict_if_present( # pyright: ignore
weights_state_dict,
prefix='model.',
)
Expand Down
2 changes: 1 addition & 1 deletion scripts/inference/convert_composer_to_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def write_huggingface_pretrained_from_composer_checkpoint(
weights_state_dict = composer_state_dict
if 'state' in weights_state_dict:
weights_state_dict = weights_state_dict['state']['model']
torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
torch.nn.modules.utils.consume_prefix_in_state_dict_if_present( # pyright: ignore
weights_state_dict,
prefix='model.',
)
Expand Down
16 changes: 8 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,12 @@
]

install_requires = [
'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.26.0,<0.27',
'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.27.0,<0.28',
'mlflow>=2.14.1,<2.18',
'accelerate>=0.25,<1.2', # for HF inference `device_map`
'transformers>=4.43.2,<4.47',
'mosaicml-streaming>=0.9.0,<0.10',
'torch>=2.4.0,<2.4.1',
'torch>=2.5.1,<2.5.2',
'datasets>=2.20.0,<2.21',
'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data
'sentencepiece==0.2.0',
Expand All @@ -79,38 +79,38 @@
extra_deps = {}

extra_deps['dev'] = [
'coverage[toml]==7.6.1',
'coverage[toml]==7.6.4',
'pre-commit>=3.4.0,<4',
'pytest>=7.2.1,<9',
'pytest_codeblocks>=0.16.1,<0.18',
'pytest-cov>=4,<6',
'pytest-cov>=4,<7',
'pyright==1.1.256',
'toml>=0.10.2,<0.11',
'packaging>=21,<25',
'hf_transfer==0.1.8',
]

extra_deps['databricks'] = [
'mosaicml[databricks]>=0.26.0,<0.27',
'mosaicml[databricks]>=0.27.0,<0.28',
'numpy<2',
'databricks-sql-connector>=3,<4',
'databricks-connect==14.1.0',
'lz4>=4,<5',
]

extra_deps['tensorboard'] = [
'mosaicml[tensorboard]>=0.26.0,<0.27',
'mosaicml[tensorboard]>=0.27.0,<0.28',
]

# Flash 2 group kept for backwards compatibility
extra_deps['gpu-flash2'] = [
'flash-attn>=2.6.3,<3',
'flash-attn==2.6.3',
]

extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])

extra_deps['peft'] = [
'mosaicml[peft]>=0.26.0,<0.27',
'mosaicml[peft]>=0.27.0,<0.28',
]

extra_deps['openai'] = [
Expand Down
Loading

0 comments on commit f7942f4

Please sign in to comment.