Merge branch 'main' into xiaohan/enable_extra_arg_test

mosaicml · Nov 22, 2024 · f7942f4 · f7942f4
2 parents fda9c12 + 6ce9bb7
commit f7942f4
Show file tree

Hide file tree

Showing 17 changed files with 344 additions and 111 deletions.
diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
@@ -17,11 +17,11 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: "2.4.0_cu124"
-          base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+        - name: "2.5.1_cu124"
+          base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04
           dep_groups: "[all]"
-        - name: "2.4.0_cu124_aws"
-          base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+        - name: "2.5.1_cu124_aws"
+          base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws
           dep_groups: "[all]"
     steps:
 

diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
@@ -21,9 +21,9 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: "cpu-2.4.0"
+        - name: "cpu-2.5.1"
           pip_deps: "[all-cpu]"
-          container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+          container: mosaicml/pytorch:2.5.1_cpu-python3.11-ubuntu22.04
           markers: "not gpu"
           pytest_command: "coverage run -m pytest"
     steps:

diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml
@@ -22,8 +22,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-        - name: "gpu-2.4.0-1"
-          container: mosaicml/llm-foundry:2.4.0_cu124-latest
+        - name: "gpu-2.5.1-1"
+          container: mosaicml/llm-foundry:2.5.1_cu124-latest
           markers: "gpu"
           pip_deps: "[all]"
           pytest_command: "coverage run -m pytest"
@@ -51,8 +51,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-        - name: "gpu-2.4.0-2"
-          container: mosaicml/llm-foundry:2.4.0_cu124-latest
+        - name: "gpu-2.5.1-2"
+          container: mosaicml/llm-foundry:2.5.1_cu124-latest
           markers: "gpu"
           pip_deps: "[all]"
           pytest_command: "coverage run -m pytest"
@@ -80,8 +80,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-        - name: "gpu-2.4.0-4"
-          container: mosaicml/llm-foundry:2.4.0_cu124-latest
+        - name: "gpu-2.5.1-4"
+          container: mosaicml/llm-foundry:2.5.1_cu124-latest
           markers: "gpu"
           pip_deps: "[all]"
           pytest_command: "coverage run -m pytest"

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -93,7 +93,7 @@ jobs:
           ${{ env.AWS_DOCKER_TAG }}
           ${{ env.AWS_LATEST_TAG }}
         build-args: |
-          BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+          BASE_IMAGE=mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws
           BRANCH_NAME=${{ env.BRANCH_NAME }}
           DEP_GROUPS=[all]
           KEEP_FOUNDRY=true
@@ -108,7 +108,7 @@ jobs:
           ${{ env.DOCKER_TAG }}
           ${{ env.LATEST_TAG }}
         build-args: |
-          BASE_IMAGE=mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+          BASE_IMAGE=mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04
           BRANCH_NAME=${{ env.BRANCH_NAME }}
           DEP_GROUPS=[all]
           KEEP_FOUNDRY=true
diff --git a/README.md b/README.md
@@ -113,24 +113,24 @@ If you have success/failure using LLM Foundry on other systems, please let us kn
 
 | Device         | Torch Version | Cuda Version | Status                       |
 | -------------- | ------------- | ------------ | ---------------------------- |
-| A100-40GB/80GB | 2.4.0         | 12.4         | :white_check_mark: Supported |
-| H100-80GB      | 2.4.0         | 12.4         | :white_check_mark: Supported |
+| A100-40GB/80GB | 2.5.1         | 12.4         | :white_check_mark: Supported |
+| H100-80GB      | 2.5.1         | 12.4         | :white_check_mark: Supported |
 
 ## MosaicML Docker Images
 We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories.
 
 The `mosaicml/pytorch` images are pinned to specific PyTorch and CUDA versions, and are stable and rarely updated.
 
 The `mosaicml/llm-foundry` images are built with new tags upon every commit to the `main` branch.
-You can select a specific commit hash such as `mosaicml/llm-foundry:2.4.0_cu124-36ab1ba` or take the latest one using `mosaicml/llm-foundry:2.4.0_cu124-latest`.
+You can select a specific commit hash such as `mosaicml/llm-foundry:2.5.1_cu124-9867a7b` or take the latest one using `mosaicml/llm-foundry:2.5.1_cu124-latest`.
 
 **Please Note:** The `mosaicml/llm-foundry` images do not come with the `llm-foundry` package preinstalled, just the dependencies. You will still need to `pip install llm-foundry` either from PyPI or from source.
 
 | Docker Image                                           | Torch Version | Cuda Version      | LLM Foundry dependencies installed? |
 | ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- |
-| `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04`  | 2.4.0         | 12.4 (Infiniband) | No                                  |
-| `mosaicml/llm-foundry:2.4.0_cu124-latest`              | 2.4.0         | 12.4 (Infiniband) | Yes                                 |
-| `mosaicml/llm-foundry:2.4.0_cu124_aws-latest`          | 2.4.0         | 12.4 (EFA)        | Yes                                 |
+| `mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04`  | 2.5.1         | 12.4 (Infiniband) | No                                  |
+| `mosaicml/llm-foundry:2.5.1_cu124-latest`              | 2.5.1         | 12.4 (Infiniband) | Yes                                 |
+| `mosaicml/llm-foundry:2.5.1_cu124_aws-latest`          | 2.5.1         | 12.4 (EFA)        | Yes                                 |
 
 
 # Installation

diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py
@@ -784,6 +784,10 @@ def tensor_hook(
 
         if dist.get_global_rank() == 0:
             if register_to_mlflow:
+                assert new_model_instance is not None
+                new_model_instance = self.transform_model_pre_registration(
+                    new_model_instance,
+                )
                 if self.using_peft:
 
                     # Save and register peft model to mlflow, this code path uses our older two step logic
@@ -798,10 +802,6 @@ def tensor_hook(
                         temp_save_dir,
                         'register_save',
                     )
-                    assert new_model_instance is not None
-                    new_model_instance = self.transform_model_pre_registration(
-                        new_model_instance,
-                    )
                     new_model_instance.save_pretrained(
                         register_save_dir,
                         max_shard_size='1GB',
@@ -860,9 +860,6 @@ def _save_and_register_peft_model(
         original_tokenizer: Optional[Any],
         save_dir: str,
     ):
-        new_model_instance = self.transform_model_pre_registration(
-            new_model_instance,
-        )
         components = {'model': new_model_instance}
         if original_tokenizer is not None:
             components['tokenizer'] = original_tokenizer

diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -1,6 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import json
 import logging
 import os
 import re
@@ -27,6 +28,7 @@
     FaultyDataPrepCluster,
     InsufficientPermissionsError,
     MalformedUCTableError,
+    StoragePermissionError,
     UCNotEnabledError,
 )
 
@@ -681,7 +683,7 @@ def fetch_DT(
 
     log.info(f'Directory {json_output_folder} created.')
 
-    # validate_and_get_cluster_info allows cluster_id to be None if use_serverless is True
+    # `validate_and_get_cluster_info` allows cluster_id to be None if use_serverless is True.
     method, dbsql, sparkSession = validate_and_get_cluster_info(
         cluster_id=cluster_id,
         databricks_host=DATABRICKS_HOST,
@@ -704,6 +706,14 @@ def fetch_DT(
             dbsql,
         )
     except (grpc.RpcError, spark_errors.SparkConnectGrpcException) as e:
+        if isinstance(
+            e,
+            spark_errors.SparkConnectGrpcException,
+        ) and 'is not Shared or Single User Cluster' in str(e):
+            raise FaultyDataPrepCluster(
+                message=
+                f'The cluster you have provided: {cluster_id} does not have data governance enabled. Please use a cluster with a data security mode other than NONE. {e}',
+            ) from e
         if isinstance(
             e,
             spark_errors.SparkConnectGrpcException,
@@ -732,12 +742,38 @@ def fetch_DT(
     if dbsql is not None:
         dbsql.close()
 
-    # combine downloaded jsonl into one big jsonl for IFT
+    # Combine downloaded jsonl into one big jsonl for IFT.
     iterative_combine_jsons(
         json_output_folder,
         os.path.join(json_output_folder, json_output_filename),
     )
 
+    _validate_written_file(
+        json_output_folder,
+        json_output_filename,
+        delta_table_name,
+    )
+
+
+def _validate_written_file(
+    json_output_folder: str,
+    json_output_filename: str,
+    delta_table_name: str,
+):
+    """Check that the combined JSONL output is non-empty and parses as JSON.
+
+    Args:
+        json_output_folder (str): Folder containing the combined output file.
+        json_output_filename (str): Name of the combined output file.
+        delta_table_name (str): Source table name, used only in the error
+            message raised when the file is empty.
+
+    Raises:
+        ValueError: If any line of the file is not valid JSON.
+        StoragePermissionError: If the file contains no lines at all.
+    """
+    output_path = os.path.join(json_output_folder, json_output_filename)
+    is_empty = True
+    with open(output_path) as f:
+        # Iterate the handle lazily rather than calling ``readlines()`` so a
+        # large combined dataset is never held in memory all at once.
+        for line in f:
+            is_empty = False
+            try:
+                json.loads(line)
+            except Exception as e:
+                raise ValueError(f'Line is not valid json: {line}') from e
+    if is_empty:
+        raise StoragePermissionError(
+            f'Unable to download {delta_table_name}, check network permissions.',
+        )
+
 
 def _check_imports():
     try:

diff --git a/llmfoundry/data/contrastive_pairs/dataloader.py b/llmfoundry/data/contrastive_pairs/dataloader.py
@@ -13,6 +13,7 @@
 import numpy as np
 import torch
 from composer.core import DataSpec
+from composer.utils import retry
 from streaming import Stream, StreamingDataset
 from torch.utils.data import DataLoader
 from transformers import PreTrainedTokenizerBase
@@ -136,6 +137,7 @@ def _get_contrastive_samples(
             'negative': negative_responses,
         }
 
+    @retry(BlockingIOError, num_attempts=5, initial_backoff=1.0, max_jitter=0.5)
     def __getitem__(self, idx: int) -> dict[str, list[int]]:
         sample = StreamingDataset.__getitem__(self, idx)
         text_samples = []

diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
@@ -79,6 +79,8 @@
 from llmfoundry.models.layers.norm import LPLayerNorm  # type: ignore
 # isort: on
 
+from llmfoundry.utils.warnings import VersionedDeprecationWarning
+
 log = logging.getLogger(__name__)
 
 CROSS_ENTROPY_IGNORE_INDEX = -100
@@ -1360,6 +1362,12 @@ def compute_loss_from_logits(
     else:
         loss = losses.sum() / (targets != loss_fn.ignore_index).sum()
         if sample_weighing_factor is not None:
+            warnings.warn(
+                VersionedDeprecationWarning(
+                    message='sample_weighing_factor has been deprecated!',
+                    remove_version='0.17.0',
+                ),
+            )
             if sample_weighing_factor.shape[0] > 1:
                 raise ValueError(
                     'Sample weighing factor is not supported when batch["sample_weighing_factor"].shape[0] > 1.',

diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
@@ -409,8 +409,10 @@ def __init__(self, output_folder: str) -> None:
 class MisconfiguredHfDatasetError(UserError):
     """Error thrown when a HuggingFace dataset is misconfigured."""
 
-    def __init__(self, dataset_name: str, split: str) -> None:
+    def __init__(self, dataset_name: str, split: Optional[str] = None) -> None:
         message = f'Your dataset (name={dataset_name}, split={split}) is misconfigured. ' + \
+            'Please check your dataset format and make sure you can load your dataset locally.' \
+            if split is not None else f'Your dataset (name={dataset_name}) is misconfigured. ' + \
             'Please check your dataset format and make sure you can load your dataset locally.'
         super().__init__(message, dataset_name=dataset_name, split=split)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 # build requirements
 [build-system]
-requires = ["setuptools < 68.0.0"]
+requires = ["setuptools < 76.0.0"]
 build-backend = "setuptools.build_meta"
 
 # iSort

diff --git a/scripts/inference/convert_composer_mpt_to_ft.py b/scripts/inference/convert_composer_mpt_to_ft.py
@@ -132,7 +132,7 @@ def write_ft_checkpoint_from_composer_checkpoint(
     # Extract the HF tokenizer
     print('#' * 30)
     print('Extracting HF Tokenizer...')
-    hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(
+    hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(  # pyright: ignore
         composer_state_dict,
         trust_remote_code,
     )
@@ -141,7 +141,7 @@ def write_ft_checkpoint_from_composer_checkpoint(
 
     # Extract the model weights
     weights_state_dict = composer_state_dict['state']['model']
-    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
+    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(  # pyright: ignore
         weights_state_dict,
         prefix='model.',
     )

diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py
@@ -133,7 +133,7 @@ def write_huggingface_pretrained_from_composer_checkpoint(
     weights_state_dict = composer_state_dict
     if 'state' in weights_state_dict:
         weights_state_dict = weights_state_dict['state']['model']
-    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
+    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(  # pyright: ignore
         weights_state_dict,
         prefix='model.',
     )

diff --git a/setup.py b/setup.py
@@ -52,12 +52,12 @@
 ]
 
 install_requires = [
-    'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.26.0,<0.27',
+    'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.27.0,<0.28',
     'mlflow>=2.14.1,<2.18',
     'accelerate>=0.25,<1.2',  # for HF inference `device_map`
     'transformers>=4.43.2,<4.47',
     'mosaicml-streaming>=0.9.0,<0.10',
-    'torch>=2.4.0,<2.4.1',
+    'torch>=2.5.1,<2.5.2',
     'datasets>=2.20.0,<2.21',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.2.0',
@@ -79,38 +79,38 @@
 extra_deps = {}
 
 extra_deps['dev'] = [
-    'coverage[toml]==7.6.1',
+    'coverage[toml]==7.6.4',
     'pre-commit>=3.4.0,<4',
     'pytest>=7.2.1,<9',
     'pytest_codeblocks>=0.16.1,<0.18',
-    'pytest-cov>=4,<6',
+    'pytest-cov>=4,<7',
     'pyright==1.1.256',
     'toml>=0.10.2,<0.11',
     'packaging>=21,<25',
     'hf_transfer==0.1.8',
 ]
 
 extra_deps['databricks'] = [
-    'mosaicml[databricks]>=0.26.0,<0.27',
+    'mosaicml[databricks]>=0.27.0,<0.28',
     'numpy<2',
     'databricks-sql-connector>=3,<4',
     'databricks-connect==14.1.0',
     'lz4>=4,<5',
 ]
 
 extra_deps['tensorboard'] = [
-    'mosaicml[tensorboard]>=0.26.0,<0.27',
+    'mosaicml[tensorboard]>=0.27.0,<0.28',
 ]
 
 # Flash 2 group kept for backwards compatibility
 extra_deps['gpu-flash2'] = [
-    'flash-attn>=2.6.3,<3',
+    'flash-attn==2.6.3',
 ]
 
 extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])
 
 extra_deps['peft'] = [
-    'mosaicml[peft]>=0.26.0,<0.27',
+    'mosaicml[peft]>=0.27.0,<0.28',
 ]
 
 extra_deps['openai'] = [