Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support local filesystem remotes for projects #11762

Merged
merged 9 commits into from
Nov 29, 2022
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0
pathy>=0.3.5
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=1.15.0
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ install_requires =
catalogue>=2.0.6,<2.1.0
# Third-party dependencies
typer>=0.3.0,<0.8.0
pathy>=0.3.5
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
Expand Down
15 changes: 11 additions & 4 deletions spacy/cli/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from .. import about

if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
from pathy import FluidPath # noqa: F401


SDIST_SUFFIX = ".tar.gz"
Expand Down Expand Up @@ -331,21 +331,28 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)


def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
"""Upload a file.

src (Path): The source path.
url (str): The destination URL to upload to.
"""
import smart_open

# Create parent directories for local paths
if isinstance(dest, Path):
if not dest.parent.exists():
dest.parent.mkdir(parents=True)

dest = str(dest)
with smart_open.open(dest, mode="wb") as output_file:
with src.open(mode="rb") as input_file:
output_file.write(input_file.read())


def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
def download_file(
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
) -> None:
"""Download a file using smart_open.

url (str): The URL of the file.
Expand All @@ -368,7 +375,7 @@ def ensure_pathy(path):
slow and annoying Google Cloud warning)."""
from pathy import Pathy # noqa: F811

return Pathy(path)
return Pathy.fluid(path)


def git_checkout(
Expand Down
42 changes: 27 additions & 15 deletions spacy/cli/project/remote_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@
import urllib.parse
import tarfile
from pathlib import Path
from wasabi import msg

from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
from .._util import get_hash, get_checksum, upload_file, download_file
from .._util import ensure_pathy, make_tempdir
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about
from ...errors import Errors

if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
from pathy import FluidPath # noqa: F401


class RemoteStorage:
Expand All @@ -28,7 +30,7 @@ def __init__(self, project_root: Path, url: str, *, compression="gz"):
self.url = ensure_pathy(url)
self.compression = compression

def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Compress a file or directory within a project and upload it to a remote
storage. If an object exists at the full URL, nothing is done.

Expand All @@ -49,9 +51,7 @@ def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
mode_string = f"w:{self.compression}" if self.compression else "w"
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
tar_file.add(str(loc), arcname=str(path))
with tar_loc.open(mode="rb") as input_file:
with url.open(mode="wb") as output_file:
output_file.write(input_file.read())
upload_file(tar_loc, url)
return url

def pull(
Expand All @@ -60,7 +60,7 @@ def pull(
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
) -> Optional["FluidPath"]:
"""Retrieve a file from the remote cache. If the file already exists,
nothing is done.

Expand Down Expand Up @@ -110,25 +110,37 @@ def find(
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
) -> Optional["FluidPath"]:
"""Find the best matching version of a file within the storage,
or `None` if no match can be found. If both the creation and content hash
are specified, only exact matches will be returned. Otherwise, the most
recent matching file is preferred.
"""
name = self.encode_name(str(path))
urls = []
if command_hash is not None and content_hash is not None:
url = self.make_url(path, command_hash, content_hash)
url = self.url / name / command_hash / content_hash
urls = [url] if url.exists() else []
elif command_hash is not None:
urls = list((self.url / name / command_hash).iterdir())
if (self.url / name / command_hash).exists():
urls = list((self.url / name / command_hash).iterdir())
else:
urls = list((self.url / name).iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
if (self.url / name).exists():
for sub_dir in (self.url / name).iterdir():
urls.extend(sub_dir.iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
if len(urls) >= 2:
try:
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
except Exception:
msg.warn(
"Unable to sort remote files by last modified. The file(s) "
"pulled from the cache may not be the most recent."
)
return urls[-1] if urls else None

def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Construct a URL from a subpath, a creation hash and a content hash."""
return self.url / self.encode_name(str(path)) / command_hash / content_hash

Expand Down
56 changes: 56 additions & 0 deletions spacy/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import Counter
from typing import Tuple, List, Dict, Any
import pkg_resources
import time

import numpy
import pytest
Expand All @@ -28,6 +29,7 @@
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage
from spacy.cli.project.run import _check_requirements
from spacy.cli.validate import get_model_pkgs
from spacy.cli.find_threshold import find_threshold
Expand Down Expand Up @@ -862,6 +864,60 @@ def test_span_length_freq_dist_output_must_be_correct():
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]


def test_local_remote_storage():
with make_tempdir() as d:
filename = "a.txt"

content_hashes = ("aaaa", "cccc", "bbbb")
for i, content_hash in enumerate(content_hashes):
# make sure that each subsequent file has a later timestamp
if i > 0:
time.sleep(1)
content = f"{content_hash} content"
loc_file = d / "root" / filename
if not loc_file.parent.exists():
loc_file.parent.mkdir(parents=True)
with loc_file.open(mode="w") as file_:
file_.write(content)

# push first version to remote storage
remote = RemoteStorage(d / "root", str(d / "remote"))
remote.push(filename, "aaaa", content_hash)

# retrieve with full hashes
loc_file.unlink()
remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
with loc_file.open(mode="r") as file_:
assert file_.read() == content

# retrieve with command hash
loc_file.unlink()
remote.pull(filename, command_hash="aaaa")
with loc_file.open(mode="r") as file_:
assert file_.read() == content

# retrieve with content hash
loc_file.unlink()
remote.pull(filename, content_hash=content_hash)
with loc_file.open(mode="r") as file_:
assert file_.read() == content

# retrieve with no hashes
loc_file.unlink()
remote.pull(filename)
with loc_file.open(mode="r") as file_:
assert file_.read() == content


def test_local_remote_storage_pull_missing():
# pulling from a non-existent remote pulls nothing gracefully
with make_tempdir() as d:
filename = "a.txt"
remote = RemoteStorage(d / "root", str(d / "remote"))
assert remote.pull(filename, command_hash="aaaa") is None
assert remote.pull(filename) is None


def test_cli_find_threshold(capsys):
thresholds = numpy.linspace(0, 1, 10)

Expand Down
26 changes: 14 additions & 12 deletions website/docs/api/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -1391,12 +1391,13 @@ If the contents are different, the new version of the file is uploaded. Deleting
obsolete files is left up to you.

Remotes can be defined in the `remotes` section of the
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
communicate with the remote storages, so you can use any protocol that
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
you may need to install extra dependencies to use certain protocols.
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
remote storages, so you can use any protocol that `Pathy` supports, including
[S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), and the local
filesystem, although you may need to install extra dependencies to use certain
protocols.

```cli
$ python -m spacy project push [remote] [project_dir]
Expand Down Expand Up @@ -1435,12 +1436,13 @@ outputs, so if you change the config back, you'll be able to fetch back the
result.

Remotes can be defined in the `remotes` section of the
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
communicate with the remote storages, so you can use any protocol that
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
you may need to install extra dependencies to use certain protocols.
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
remote storages, so you can use any protocol that `Pathy` supports, including
[S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), and the local
filesystem, although you may need to install extra dependencies to use certain
protocols.

```cli
$ python -m spacy project pull [remote] [project_dir]
Expand Down
20 changes: 10 additions & 10 deletions website/docs/usage/projects.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,9 +259,9 @@ pipelines.
> This can be used in a project command like so:
>
> ```yaml
> - name: "echo-path"
> script:
> - "echo ${env.ENV_PATH}"
> - name: 'echo-path'
> script:
> - 'echo ${env.ENV_PATH}'
> ```

| Section | Description |
Expand Down Expand Up @@ -643,12 +643,13 @@ locally.

You can list one or more remotes in the `remotes` section of your
[`project.yml`](#project-yml) by mapping a string name to the URL of the
storage. Under the hood, spaCy uses the
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
communicate with the remote storages, so you can use any protocol that
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
you may need to install extra dependencies to use certain protocols.
storage. Under the hood, spaCy uses
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
remote storages, so you can use any protocol that `Pathy` supports, including
[S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), and the local
filesystem, although you may need to install extra dependencies to use certain
protocols.

> #### Example
>
Expand All @@ -661,7 +662,6 @@ you may need to install extra dependencies to use certain protocols.
remotes:
default: 's3://my-spacy-bucket'
local: '/mnt/scratch/cache'
stuff: 'ssh://myserver.example.com/whatever'
```

<Infobox title="How it works" emoji="💡">
Expand Down