Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cloud implementation of shutil.copy and shutil.copytree #142

Merged
merged 11 commits into from
May 29, 2021
6 changes: 5 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# cloudpathlib Changelog

## v0.4.1 (unreleased)
## v0.4.1 (2021-05-29)

- Added support for custom S3-compatible object stores. This functionality is available via the `endpoint_url` keyword argument when instantiating an `S3Client` instance. See [documentation](https://cloudpathlib.drivendata.org/authentication/#accessing-custom-s3-compatible-object-stores) for more details. ([#138](https://github.com/drivendataorg/cloudpathlib/pull/138) thanks to [@YevheniiSemendiak](https://github.com/YevheniiSemendiak))
- Added `CloudPath.upload_from` which uploads the passed path to this CloudPath (issue [#58](https://github.com/drivendataorg/cloudpathlib/issues/58))
- Added support for common file transfer functions based on `shutil`. Issue [#108](https://github.com/drivendataorg/cloudpathlib/issues/108). PR [#142](https://github.com/drivendataorg/cloudpathlib/pull/142).
- `CloudPath.copy` copies a file from one location to another. Can be cloud -> local or cloud -> cloud. If `client` is not the same, the file transits through the local machine.
- `CloudPath.copytree` recursively copies a directory from one location to another. Can be cloud -> local or cloud -> cloud. Uses `CloudPath.copy` so if `client` is not the same, the file transits through the local machine.

## v0.4.0 (2021-03-13)

Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,14 @@ Most methods and properties from `pathlib.Path` are supported except for the one
| `symlink_to` | ❌ | ❌ | ❌ |
| `with_stem` | ❌ | ❌ | ❌ |
| `cloud_prefix` | ✅ | ✅ | ✅ |
| `copy` | ✅ | ✅ | ✅ |
| `copytree` | ✅ | ✅ | ✅ |
| `download_to` | ✅ | ✅ | ✅ |
| `etag` | ✅ | ✅ | ✅ |
| `fspath` | ✅ | ✅ | ✅ |
| `is_valid_cloudpath` | ✅ | ✅ | ✅ |
| `rmtree` | ✅ | ✅ | ✅ |
| `upload_from` | ✅ | ✅ | ✅ |
| `blob` | ✅ | ❌ | ✅ |
| `bucket` | ❌ | ✅ | ✅ |
| `container` | ✅ | ❌ | ❌ |
Expand Down
12 changes: 12 additions & 0 deletions cloudpathlib/anypath.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from pathlib import Path
from typing import Union

Expand Down Expand Up @@ -52,3 +53,14 @@ def _validate(cls, value) -> Union[CloudPath, Path]:
https://pydantic-docs.helpmanual.io/usage/types/#custom-data-types"""
# Note __new__ is static method and not a class method
return cls.__new__(cls, value)


def to_anypath(s: Union[str, os.PathLike]) -> Union[CloudPath, Path]:
    """Coerce a str or os.PathLike into a Path or CloudPath object.

    Values that are already ``Path`` or ``CloudPath`` instances are handed
    back untouched; anything else is passed to ``AnyPath`` for conversion.
    """
    already_concrete = isinstance(s, (CloudPath, Path))

    # only values that are not yet a path object need to go through AnyPath
    return s if already_concrete else AnyPath(s)  # type: ignore
pjbull marked this conversation as resolved.
Show resolved Hide resolved
9 changes: 6 additions & 3 deletions cloudpathlib/azure/azblobclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _get_metadata(self, cloud_path: AzureBlobPath) -> Dict[str, Any]:

def _download_file(
self, cloud_path: AzureBlobPath, local_path: Union[str, os.PathLike]
) -> Union[str, os.PathLike]:
) -> Path:
blob = self.service_client.get_blob_client(
container=cloud_path.container, blob=cloud_path.blob
)
Expand Down Expand Up @@ -171,7 +171,9 @@ def _list_dir(

yield self.CloudPath(f"az://{cloud_path.container}/{o.name}")

def _move_file(self, src: AzureBlobPath, dst: AzureBlobPath) -> AzureBlobPath:
def _move_file(
self, src: AzureBlobPath, dst: AzureBlobPath, remove_src: bool = True
) -> AzureBlobPath:
# just a touch, so "REPLACE" metadata
if src == dst:
blob_client = self.service_client.get_blob_client(
Expand All @@ -189,7 +191,8 @@ def _move_file(self, src: AzureBlobPath, dst: AzureBlobPath) -> AzureBlobPath:

target.start_copy_from_url(source.url)

self._remove(src)
if remove_src:
self._remove(src)

return dst

Expand Down
6 changes: 4 additions & 2 deletions cloudpathlib/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def CloudPath(self, cloud_path: Union[str, BoundedCloudPath]) -> BoundedCloudPat
@abc.abstractmethod
def _download_file(
self, cloud_path: BoundedCloudPath, local_path: Union[str, os.PathLike]
) -> Union[str, os.PathLike]:
) -> Path:
pass

@abc.abstractmethod
Expand All @@ -83,7 +83,9 @@ def _list_dir(
pass

@abc.abstractmethod
def _move_file(self, src: BoundedCloudPath, dst: BoundedCloudPath) -> BoundedCloudPath:
def _move_file(
self, src: BoundedCloudPath, dst: BoundedCloudPath, remove_src: bool = True
pjbull marked this conversation as resolved.
Show resolved Hide resolved
) -> BoundedCloudPath:
pass

@abc.abstractmethod
Expand Down
141 changes: 129 additions & 12 deletions cloudpathlib/cloudpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from urllib.parse import urlparse
from warnings import warn

from . import anypath

from .exceptions import (
ClientMismatchError,
CloudPathFileExistsError,
Expand Down Expand Up @@ -585,12 +587,12 @@ def read_text(self):
return self._dispatch_to_local_cache_path("read_text")

# =========== public cloud methods, not in pathlib ===============
def download_to(self, destination: Union[str, os.PathLike]):
def download_to(self, destination: Union[str, os.PathLike]) -> Path:
destination = Path(destination)
if self.is_file():
if destination.is_dir():
destination = destination / self.name
self.client._download_file(self, destination)
return self.client._download_file(self, destination)
else:
destination.mkdir(exist_ok=True)
for f in self.iterdir():
Expand All @@ -601,6 +603,8 @@ def download_to(self, destination: Union[str, os.PathLike]):
rel_dest = str(f)[len(rel) :]
f.download_to(destination / rel_dest)

return destination

def rmtree(self):
"""Delete an entire directory tree."""
if self.is_file():
Expand All @@ -609,6 +613,108 @@ def rmtree(self):
)
self.client._remove(self)

def upload_from(
    self, source: Union[str, os.PathLike], force_overwrite_to_cloud: bool = False
) -> "CloudPath":
    """Upload a local file or directory to this cloud path.

    A directory is walked one level at a time: every entry is uploaded to the
    child of this path that carries the same name, and this path is returned.
    A single file is uploaded to this path itself, or to ``self / source.name``
    when this path already exists as a directory; the path that was targeted
    is returned.
    """
    local_source = Path(source)

    if not local_source.is_dir():
        # an existing cloud directory receives the file under the file's own name
        into_existing_dir = self.exists() and self.is_dir()
        target = self / local_source.name if into_existing_dir else self

        target._upload_file_to_cloud(
            local_source, force_overwrite_to_cloud=force_overwrite_to_cloud
        )
        return target

    for child in local_source.iterdir():
        (self / child.name).upload_from(
            child, force_overwrite_to_cloud=force_overwrite_to_cloud
        )

    return self

def copy(
    self,
    destination: Union[str, os.PathLike, "CloudPath"],
    force_overwrite_to_cloud: bool = False,
) -> Union[Path, "CloudPath"]:
    """Copy this file to a destination folder or file.

    ``destination`` may be a string, a local path, or a cloud path. A
    destination that is not a ``CloudPath`` is served by ``download_to``. A
    cloud destination on the same client is written with the client's
    ``_move_file`` (keeping the source); a cloud destination on a different
    client is written with ``upload_from`` using ``self.fspath``.

    Raises ``ValueError`` when this path is not an existing file, and
    ``OverwriteNewerCloudError`` when a same-client destination is at least as
    new as this file and ``force_overwrite_to_cloud`` is false.
    """
    is_existing_file = self.exists() and self.is_file()
    if not is_existing_file:
        raise ValueError(
            f"Path {self} should be a file. To copy a directory tree use the method copytree."
        )

    # strings and path-likes (local or cloud) are normalized to path objects
    if isinstance(destination, (str, os.PathLike)):
        destination = anypath.to_anypath(destination)

    # local destination: a plain download does the job
    if not isinstance(destination, CloudPath):
        return self.download_to(destination)

    # different clients: go through the local file exposed by fspath
    if self.client is not destination.client:
        lands_on_destination = not destination.exists() or destination.is_file()
        target = destination if lands_on_destination else destination / self.name
        return target.upload_from(
            self.fspath, force_overwrite_to_cloud=force_overwrite_to_cloud
        )

    # same client: use the client's _move_file to avoid downloading
    if destination.exists() and destination.is_dir():
        destination = destination / self.name  # type: ignore

    overwrite_blocked = (
        not force_overwrite_to_cloud
        and destination.exists()
        and destination.stat().st_mtime >= self.stat().st_mtime
    )
    if overwrite_blocked:
        raise OverwriteNewerCloudError(
            f"File ({destination}) is newer than ({self}). "
            f"To overwrite "
            f"pass `force_overwrite_to_cloud=True`."
        )

    return self.client._move_file(self, destination, remove_src=False)

def copytree(
    self,
    destination: Union[str, os.PathLike, "CloudPath"],
    force_overwrite_to_cloud: bool = False,
) -> Union[Path, "CloudPath"]:
    """Copy self to a directory, if self is a directory.

    ``destination`` may be a string, a local path, or a cloud path; strings
    and path-likes are converted with ``anypath.to_anypath``. The destination
    directory is created (including parents) when missing, then every entry
    yielded by ``self.iterdir()`` is copied into it: files via ``copy`` and
    directories via a recursive ``copytree``.

    Returns the destination path object.

    Raises ``CloudPathNotADirectoryError`` when self is not a directory, and
    ``CloudPathFileExistsError`` when the destination exists and is a file.
    """
    if not self.is_dir():
        raise CloudPathNotADirectoryError(
            f"Origin path {self} must be a directory. To copy a single file use the method copy."
        )

    # handle string version of cloud paths + local paths
    if isinstance(destination, (str, os.PathLike)):
        destination = anypath.to_anypath(destination)

    if destination.exists() and destination.is_file():
        # must be an f-string, otherwise "{destination}" is printed literally
        raise CloudPathFileExistsError(
            f"Destination path {destination} of copytree must be a directory."
        )

    destination.mkdir(parents=True, exist_ok=True)

    for subpath in self.iterdir():
        if subpath.is_file():
            subpath.copy(
                destination / subpath.name, force_overwrite_to_cloud=force_overwrite_to_cloud
            )
        elif subpath.is_dir():
            subpath.copytree(
                destination / subpath.name, force_overwrite_to_cloud=force_overwrite_to_cloud
            )

    return destination

# =========== private cloud methods ===============
@property
def _local(self):
Expand Down Expand Up @@ -673,11 +779,30 @@ def _refresh_cache(self, force_overwrite_from_cloud=False):
)

def _upload_local_to_cloud(self, force_overwrite_to_cloud: bool = False):
"""Uploads cache file at self._local to the cloud"""
# We should never try to be syncing entire directories; we should only
# cache and upload individual files.
if self._local.is_dir():
raise ValueError("Only individual files can be uploaded to the cloud")

uploaded = self._upload_file_to_cloud(
self._local, force_overwrite_to_cloud=force_overwrite_to_cloud
)

# force cache time to match cloud times
stats = self.stat()
os.utime(self._local, times=(stats.st_mtime, stats.st_mtime))

# reset dirty and handle now that this is uploaded
self._dirty = False
self._handle = None

return uploaded

def _upload_file_to_cloud(self, local_path, force_overwrite_to_cloud: bool = False):
"""Uploads file at `local_path` to the cloud if there is not a newer file
already there.
"""
try:
stats = self.stat()
except NoStatError:
Expand All @@ -686,22 +811,14 @@ def _upload_local_to_cloud(self, force_overwrite_to_cloud: bool = False):
# if cloud does not exist or local is newer or we are overwriting, do the upload
if (
not stats # cloud does not exist
or (self._local.stat().st_mtime > stats.st_mtime)
or (local_path.stat().st_mtime > stats.st_mtime)
or force_overwrite_to_cloud
):
self.client._upload_file(
self._local,
local_path,
self,
)

# force cache time to match cloud times
stats = self.stat()
os.utime(self._local, times=(stats.st_mtime, stats.st_mtime))

# reset dirty and handle now that this is uploaded
self._dirty = False
self._handle = None

return self

# cloud is newer and we are not overwriting
Expand Down
14 changes: 8 additions & 6 deletions cloudpathlib/gs/gsclient.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from datetime import datetime
import os
from pathlib import PurePosixPath
from pathlib import Path, PurePosixPath
from typing import Any, Dict, Iterable, Optional, TYPE_CHECKING, Union

from ..client import Client, register_client_class
Expand Down Expand Up @@ -91,12 +91,12 @@ def _get_metadata(self, cloud_path: GSPath) -> Optional[Dict[str, Any]]:
"updated": blob.updated,
}

def _download_file(
self, cloud_path: GSPath, local_path: Union[str, os.PathLike]
) -> Union[str, os.PathLike]:
def _download_file(self, cloud_path: GSPath, local_path: Union[str, os.PathLike]) -> Path:
bucket = self.client.bucket(cloud_path.bucket)
blob = bucket.get_blob(cloud_path.blob)

local_path = Path(local_path)

blob.download_to_filename(local_path)
return local_path

Expand Down Expand Up @@ -158,7 +158,7 @@ def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[GSPath]:

yield self.CloudPath(f"gs://{cloud_path.bucket}/{o.name}")

def _move_file(self, src: GSPath, dst: GSPath) -> GSPath:
def _move_file(self, src: GSPath, dst: GSPath, remove_src: bool = True) -> GSPath:
# just a touch, so "REPLACE" metadata
if src == dst:
bucket = self.client.bucket(src.bucket)
Expand All @@ -177,7 +177,9 @@ def _move_file(self, src: GSPath, dst: GSPath) -> GSPath:

src_blob = src_bucket.get_blob(src.blob)
src_bucket.copy_blob(src_blob, dst_bucket, dst.blob)
src_blob.delete()

if remove_src:
src_blob.delete()

return dst

Expand Down
14 changes: 9 additions & 5 deletions cloudpathlib/local/localclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,7 @@ def _local_to_cloud_path(self, local_path: Union[str, os.PathLike]) -> "LocalPat
f"{cloud_prefix}{PurePosixPath(local_path.relative_to(self._local_storage_dir))}"
)

def _download_file(
self, cloud_path: "LocalPath", local_path: Union[str, os.PathLike]
) -> Union[str, os.PathLike]:
def _download_file(self, cloud_path: "LocalPath", local_path: Union[str, os.PathLike]) -> Path:
local_path = Path(local_path)
local_path.parent.mkdir(exist_ok=True, parents=True)
shutil.copyfile(self._cloud_path_to_local(cloud_path), local_path)
Expand Down Expand Up @@ -83,9 +81,15 @@ def _list_dir(self, cloud_path: "LocalPath", recursive=False) -> Iterable["Local
def _md5(self, cloud_path: "LocalPath") -> str:
return md5(self._cloud_path_to_local(cloud_path).read_bytes()).hexdigest()

def _move_file(self, src: "LocalPath", dst: "LocalPath") -> "LocalPath":
def _move_file(
self, src: "LocalPath", dst: "LocalPath", remove_src: bool = True
) -> "LocalPath":
self._cloud_path_to_local(dst).parent.mkdir(exist_ok=True, parents=True)
self._cloud_path_to_local(src).replace(self._cloud_path_to_local(dst))

if remove_src:
self._cloud_path_to_local(src).replace(self._cloud_path_to_local(dst))
else:
shutil.copy(self._cloud_path_to_local(src), self._cloud_path_to_local(dst))
return dst

def _remove(self, cloud_path: "LocalPath") -> None:
Expand Down
Loading