Skip to content

Commit

Permalink
Adding appropriate error when inputting URLs in the add_files function (instead of using add_external_files)
Browse files Browse the repository at this point in the history
  • Loading branch information
morzahavi committed Sep 25, 2024
1 parent eaeadb1 commit 50ad7b0
Showing 1 changed file with 65 additions and 48 deletions.
113 changes: 65 additions & 48 deletions clearml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,58 +380,75 @@ def add_tags(self, tags):
:param tags: A list of tags which describe the Task to add.
"""
self._task.add_tags(tags)
import re

def add_files(
self,
path, # type: Union[str, Path, _Path]
wildcard=None, # type: Optional[Union[str, Sequence[str]]]
local_base_folder=None, # type: Optional[str]
dataset_path=None, # type: Optional[str]
recursive=True, # type: bool
verbose=False, # type: bool
max_workers=None, # type: Optional[int]
):
# type: (...) -> ()
"""
Add a folder into the current dataset. calculate file hash,
and compare against parent, mark files to be uploaded
:param path: Add a folder/file to the dataset
:param wildcard: add only specific set of files.
Wildcard matching, can be a single string or a list of wildcards.
:param local_base_folder: files will be located based on their relative path from local_base_folder
:param dataset_path: where in the dataset the folder/files should be located
:param recursive: If True, match all wildcard files recursively
:param verbose: If True, print to console files added/modified
:param max_workers: The number of threads to add the files with. Defaults to the number of logical cores
:return: number of files added
"""
max_workers = max_workers or psutil.cpu_count()
self._dirty = True
self._task.get_logger().report_text(
'Adding files to dataset: {}'.format(
dict(path=path, wildcard=wildcard, local_base_folder=local_base_folder,
dataset_path=dataset_path, recursive=recursive, verbose=verbose)),
print_console=False)
def is_url(path):
    """
    Return True if *path* looks like a remote object-storage / web URL
    (e.g. ``s3://``, ``http://``), rather than a local filesystem path.

    :param path: Path-like or string to inspect; coerced with ``str()`` so
        ``pathlib.Path`` objects are accepted without raising ``TypeError``.
    :return: True if the path starts with a known remote URL scheme.
    """
    # 'https' must be matched explicitly: in the original pattern
    # '^(?:http|ftp|s3|gs|azure)://' the branch 'http' requires '://' to
    # follow immediately, so 'https://...' silently failed to match.
    url_regex = re.compile(
        r'^(?:https?|ftp|s3|gs|azure)://'  # schemes: http(s), ftp, s3, gs, azure
    )
    return url_regex.match(str(path)) is not None

num_added, num_modified = self._add_files(
path=path,
wildcard=wildcard,
local_base_folder=local_base_folder,
dataset_path=dataset_path,
recursive=recursive,
verbose=verbose,
max_workers=max_workers,
def add_files(
        self,
        path,  # type: Union[str, Path, _Path]
        wildcard=None,  # type: Optional[Union[str, Sequence[str]]]
        local_base_folder=None,  # type: Optional[str]
        dataset_path=None,  # type: Optional[str]
        recursive=True,  # type: bool
        verbose=False,  # type: bool
        max_workers=None,  # type: Optional[int]
):
    # type: (...) -> ()
    """
    Add a folder into the current dataset. calculate file hash,
    and compare against parent, mark files to be uploaded

    :param path: Add a folder/file to the dataset
    :param wildcard: add only specific set of files.
        Wildcard matching, can be a single string or a list of wildcards.
    :param local_base_folder: files will be located based on their relative path from local_base_folder
    :param dataset_path: where in the dataset the folder/files should be located
    :param recursive: If True, match all wildcard files recursively
    :param verbose: If True, print to console files added/modified
    :param max_workers: The number of threads to add the files with. Defaults to the number of logical cores
    :raises ValueError: If `path` is a remote URL (e.g. s3://, http://); use `add_external_files()` for those
    :return: number of files added
    """
    import re  # local import: keeps the URL guard self-contained

    # Reject remote URLs up-front: add_files() only handles local paths.
    # The check is inlined (instead of calling a class-body-level `is_url`
    # helper) because names defined in a class body are NOT in a method's
    # lexical scope — a bare `is_url(path)` call would raise NameError.
    # str(path) is used so pathlib.Path arguments do not raise TypeError.
    # 'https?' also fixes the original pattern silently missing 'https://'.
    if re.match(r'^(?:https?|ftp|s3|gs|azure)://', str(path)):
        raise ValueError(
            "The path provided seems to be an external URL (e.g., s3://, http://). "
            "Please use `add_external_files()` to add external files to the dataset."
        )

    # Default to one worker thread per logical core.
    max_workers = max_workers or psutil.cpu_count()
    self._dirty = True
    self._task.get_logger().report_text(
        'Adding files to dataset: {}'.format(
            dict(path=path, wildcard=wildcard, local_base_folder=local_base_folder,
                 dataset_path=dataset_path, recursive=recursive, verbose=verbose)),
        print_console=False)

    # Hash files, diff against the parent version, and mark changes for upload.
    num_added, num_modified = self._add_files(
        path=path,
        wildcard=wildcard,
        local_base_folder=local_base_folder,
        dataset_path=dataset_path,
        recursive=recursive,
        verbose=verbose,
        max_workers=max_workers,
    )

    # update the task script
    self._add_script_call(
        'add_files', path=path, wildcard=wildcard, local_base_folder=local_base_folder,
        dataset_path=dataset_path, recursive=recursive)

    self._serialize()

    return num_added

def add_external_files(
self,
Expand Down

0 comments on commit 50ad7b0

Please sign in to comment.