diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py
index 14ccae7191..004b6df09d 100644
--- a/llmfoundry/callbacks/hf_checkpointer.py
+++ b/llmfoundry/callbacks/hf_checkpointer.py
@@ -44,6 +44,7 @@
 
 from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM
 from llmfoundry.models.utils import init_empty_weights
+from llmfoundry.utils.exceptions import StoragePermissionError
 from llmfoundry.utils.huggingface_hub_utils import \
     edit_files_for_hf_compatibility
 
@@ -297,7 +298,16 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None:
                     + f'Got {type(state.model)} instead.',
                 )
             if self.remote_ud is not None:
-                self.remote_ud.init(state, logger)
+                try:
+                    self.remote_ud.init(state, logger)
+                except PermissionError as e:
+                    if 'Client Error' in str(
+                        e,
+                    ):  # thrown from composer.utils._wrap_mlflow_exceptions
+                        raise StoragePermissionError(
+                            'Error when write to save_folder.',
+                        ) from e
+                    raise e
                 state.callbacks.append(self.remote_ud)
 
             if self.mlflow_registered_model_name is not None:
diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
index c89e13749c..e8de7ac7eb 100644
--- a/llmfoundry/utils/exceptions.py
+++ b/llmfoundry/utils/exceptions.py
@@ -30,6 +30,7 @@
     'MisconfiguredHfDatasetError',
     'DatasetTooSmallError',
     'RunTimeoutError',
+    'StoragePermissionError',
     'UCNotEnabledError',
     'DeltaTableNotFoundError',
 ]
@@ -528,6 +529,21 @@ def __str__(self):
         return self.message
 
 
+class StoragePermissionError(UserError):
+    """Error thrown due to invalid permissions accessing blob storage."""
+
+    def __init__(self, message: str) -> None:
+        self.message = message
+        super().__init__(message)
+
+    def __reduce__(self):
+        # Return a tuple of class, a tuple of arguments, and optionally state
+        return (StoragePermissionError, (self.message,))
+
+    def __str__(self):
+        return self.message
+
+
 class UCNotEnabledError(UserError):
     """Error thrown when user does not have UC enabled on their cluster."""
 