Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Mlflow 403 PL UserError #1623

Merged
merged 11 commits into from
Oct 31, 2024
9 changes: 8 additions & 1 deletion llmfoundry/callbacks/hf_checkpointer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
PreTrainedTokenizerBase,
)

from llmfoundry.utils.exceptions import ProbablyNetworkingError
from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM
from llmfoundry.models.utils import init_empty_weights
from llmfoundry.utils.huggingface_hub_utils import \
Expand Down Expand Up @@ -296,7 +297,13 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None:
+ f'Got {type(state.model)} instead.',
)
if self.remote_ud is not None:
self.remote_ud.init(state, logger)
try:
self.remote_ud.init(state, logger)
except PermissionError as e:
log.error(
f'Error initializing remote uploader/downloader. This is likely private link related. {e}',
)
raise ProbablyNetworkingError(is_contd_pretrain=state.run_name.startswith('contd-pretrain'))
mattyding marked this conversation as resolved.
Show resolved Hide resolved
state.callbacks.append(self.remote_ud)

if self.mlflow_registered_model_name is not None:
Expand Down
9 changes: 9 additions & 0 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
'MisconfiguredHfDatasetError',
'DatasetTooSmallError',
'RunTimeoutError',
'ProbablyNetworkingError',
]

ALLOWED_RESPONSE_KEYS = {'response', 'completion'}
Expand Down Expand Up @@ -524,3 +525,11 @@ def __reduce__(self):

def __str__(self):
return self.message


class ProbablyNetworkingError(UserError):
    """Error thrown due to a networking restriction.

    The message is fixed: it reports a failure to reach a remote URL and
    attributes it to a likely networking restriction (private link or SEG).

    Args:
        is_contd_pretrain (bool): Forwarded unchanged to ``UserError`` as the
            ``is_contd_pretrain`` keyword argument.
    """

    def __init__(self, is_contd_pretrain: bool) -> None:
        # Plain string literal: the message has no placeholders, so an
        # f-string prefix is unnecessary.
        message = (
            'An error occurred while trying to access a remote URL. '
            'This is likely due to some networking restriction such as '
            'private link or SEG.'
        )
        super().__init__(message, is_contd_pretrain=is_contd_pretrain)
Loading