Skip to content

Commit

Permalink
Prevent excessive uncached readlink's when using a symlinked workdir (#…
Browse files Browse the repository at this point in the history
…9270)

Problem
Extracting tarball artifacts into a FUSE context leads to excessive readlink calls on the pants workdir symlink. Readlinks can be slow in FUSE filesystems, which may not cache readlink results. Running pants on a moderate size target can lead to 100's of thousands of readlink calls, which degrades performance perceptibly.

Solution
If the pants workdir is a symlink, cache the real path to the symlink and pass it to the tarball artifact extractor, to sidestep decompressing into the FUSE context.

Result
The accumulated syscall profile's look like:
*Before* Wall time: 05:29

...
  readlink====================================================>202344
...
*After* Wall time: 03:37

...
readlink=============>549
...
The speed up happens when building from a clean state, when all artifacts are pulled from a remote cache and decompressed into the workdir. The cold compile time, when artifacts are built and stored locally, is the same for before and after.
  • Loading branch information
Henry Fuller authored Mar 12, 2020
1 parent 2f6ccdc commit 86c22f7
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 22 deletions.
7 changes: 5 additions & 2 deletions src/python/pants/cache/artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,11 @@ class TarballArtifact(Artifact):

# TODO: Expose `dereference` for tasks.
# https://github.com/pantsbuild/pants/issues/3961
def __init__(self, artifact_root, tarfile_, compression=9, dereference=True):
def __init__(
self, artifact_root, artifact_extraction_root, tarfile_, compression=9, dereference=True
):
super().__init__(artifact_root)
self.artifact_extraction_root = artifact_extraction_root
self._tarfile = tarfile_
self._compression = compression
self._dereference = dereference
Expand Down Expand Up @@ -113,7 +116,7 @@ def extract(self):
# after the extraction.
try:
self.NATIVE_BINARY.decompress_tarball(
self._tarfile.encode(), self._artifact_root.encode()
self._tarfile.encode(), self.artifact_extraction_root.encode()
)
except Exception as e:
raise ArtifactError("Extracting artifact failed:\n{}".format(e))
3 changes: 2 additions & 1 deletion src/python/pants/cache/artifact_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,13 @@ class ArtifactCache:
Subclasses implement the methods below to provide this functionality.
"""

def __init__(self, artifact_root):
def __init__(self, artifact_root, artifact_extraction_root=None):
"""Create an ArtifactCache.
All artifacts must be under artifact_root.
"""
self.artifact_root = artifact_root
self.artifact_extraction_root = artifact_extraction_root or artifact_root

def prune(self):
"""Prune stale cache files.
Expand Down
11 changes: 10 additions & 1 deletion src/python/pants/cache/cache_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,12 @@ def _do_create_artifact_cache(self, spec, action):
raise ValueError("compression_level must be an integer 1-9: {}".format(compression))

artifact_root = self._options.pants_workdir
# If the artifact root is a symlink it is more efficient to readlink the symlink
# only once in a FUSE context like VCFS. The artifact extraction root lets us extract
# artifacts directly side-stepping a VCFS lookup if in use.
artifact_extraction_root = (
os.readlink(artifact_root) if os.path.islink(artifact_root) else artifact_root
)

def create_local_cache(parent_path):
path = os.path.join(parent_path, self._cache_dirname)
Expand All @@ -361,6 +367,7 @@ def create_local_cache(parent_path):
)
return LocalArtifactCache(
artifact_root,
artifact_extraction_root,
path,
compression,
self._options.max_entries_per_target,
Expand All @@ -375,7 +382,9 @@ def create_remote_cache(remote_spec, local_cache):
best_url_selector = BestUrlSelector(
["{}/{}".format(url.rstrip("/"), self._cache_dirname) for url in urls]
)
local_cache = local_cache or TempLocalArtifactCache(artifact_root, compression)
local_cache = local_cache or TempLocalArtifactCache(
artifact_root, artifact_extraction_root, compression
)
return RESTfulArtifactCache(
artifact_root,
best_url_selector,
Expand Down
30 changes: 25 additions & 5 deletions src/python/pants/cache/local_artifact_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,35 @@


class BaseLocalArtifactCache(ArtifactCache):
def __init__(self, artifact_root, compression, permissions=None, dereference=True):
def __init__(
self,
artifact_root,
artifact_extraction_root,
compression,
permissions=None,
dereference=True,
):
"""
:param str artifact_root: The path under which cacheable products will be read/written.
:param str artifact_extraction_root: The path to where we should extract artifacts. Usually a reified artifact_path.
:param int compression: The gzip compression level for created artifacts.
Valid values are 0-9.
:param str permissions: File permissions to use when creating artifact files.
:param bool dereference: Dereference symlinks when creating the cache tarball.
"""
super().__init__(artifact_root)
super().__init__(artifact_root, artifact_extraction_root)
self._compression = compression
self._cache_root = None
self._permissions = permissions
self._dereference = dereference

def _artifact(self, path):
return TarballArtifact(
self.artifact_root, path, self._compression, dereference=self._dereference
self.artifact_root,
self.artifact_extraction_root,
path,
self._compression,
dereference=self._dereference,
)

@contextmanager
Expand Down Expand Up @@ -101,6 +113,7 @@ class LocalArtifactCache(BaseLocalArtifactCache):
def __init__(
self,
artifact_root,
artifact_extraction_root,
cache_root,
compression,
max_entries_per_target=None,
Expand All @@ -109,6 +122,7 @@ def __init__(
):
"""
:param str artifact_root: The path under which cacheable products will be read/written.
:param str artifact_extraction_root: The path to where we should extract artifacts. Usually a reified artifact_path.
:param str cache_root: The locally cached files are stored under this directory.
:param int compression: The gzip compression level for created artifacts (1-9 or false-y).
:param int max_entries_per_target: The maximum number of old cache files to leave behind on a cache miss.
Expand All @@ -117,6 +131,7 @@ def __init__(
"""
super().__init__(
artifact_root,
artifact_extraction_root,
compression,
permissions=int(permissions.strip(), base=8) if permissions else None,
dereference=dereference,
Expand Down Expand Up @@ -192,11 +207,16 @@ class TempLocalArtifactCache(BaseLocalArtifactCache):
calls, but is useful for handling file IO for a remote cache.
"""

def __init__(self, artifact_root, compression, permissions=None):
def __init__(self, artifact_root, artifact_extraction_root, compression, permissions=None):
"""
:param str artifact_root: The path under which cacheable products will be read/written.
"""
super().__init__(artifact_root, compression=compression, permissions=permissions)
super().__init__(
artifact_root,
artifact_extraction_root,
compression=compression,
permissions=permissions,
)

def _store_tarball(self, cache_key, src):
return src
Expand Down
20 changes: 15 additions & 5 deletions tests/python/pants_test/cache/test_artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ def test_get_paths_after_collect(self):

file_path = self.touch_file_in(artifact_root)

artifact = TarballArtifact(artifact_root, os.path.join(cache_root, "some.tar"))
artifact = TarballArtifact(
artifact_root, artifact_root, os.path.join(cache_root, "some.tar")
)
artifact.collect([file_path])

self.assertEqual([file_path], list(artifact.get_paths()))
Expand All @@ -36,7 +38,9 @@ def test_does_not_exist_when_no_tar_file(self):
cache_root = os.path.join(tmpdir, "cache")
safe_mkdir(cache_root)

artifact = TarballArtifact(artifact_root, os.path.join(cache_root, "some.tar"))
artifact = TarballArtifact(
artifact_root, artifact_root, os.path.join(cache_root, "some.tar")
)
self.assertFalse(artifact.exists())

def test_exists_true_when_exists(self):
Expand All @@ -47,21 +51,27 @@ def test_exists_true_when_exists(self):

path = self.touch_file_in(artifact_root)

artifact = TarballArtifact(artifact_root, os.path.join(cache_root, "some.tar"))
artifact = TarballArtifact(
artifact_root, artifact_root, os.path.join(cache_root, "some.tar")
)
artifact.collect([path])

self.assertTrue(artifact.exists())

def test_non_existent_tarball_extraction(self):
with temporary_dir() as tmpdir:
artifact = TarballArtifact(artifact_root=tmpdir, tarfile_="vapor.tar")
artifact = TarballArtifact(
artifact_root=tmpdir, artifact_extraction_root=tmpdir, tarfile_="vapor.tar"
)
with self.assertRaises(ArtifactError):
artifact.extract()

def test_corrupt_tarball_extraction(self):
with temporary_dir() as tmpdir:
path = self.touch_file_in(tmpdir, content="invalid")
artifact = TarballArtifact(artifact_root=tmpdir, tarfile_=path)
artifact = TarballArtifact(
artifact_root=tmpdir, artifact_extraction_root=tmpdir, tarfile_=path
)
with self.assertRaises(ArtifactError):
artifact.extract()

Expand Down
31 changes: 23 additions & 8 deletions tests/python/pants_test/cache/test_artifact_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,16 @@ def subsystems(cls):
return super().subsystems + (RequestsSession.Factory,)

@contextmanager
def setup_local_cache(self):
def setup_local_cache(self, seperate_extraction_root=False):
with temporary_dir() as artifact_root:
with temporary_dir() as cache_root:
yield LocalArtifactCache(artifact_root, cache_root, compression=1)
with temporary_dir() as artifact_extraction_root:
with temporary_dir() as cache_root:
extraction_root = (
artifact_extraction_root if seperate_extraction_root else artifact_root
)
yield LocalArtifactCache(
artifact_root, extraction_root, cache_root, compression=1
)

@contextmanager
def setup_server(self, return_failed=False, cache_root=None):
Expand All @@ -49,7 +55,7 @@ def setup_server(self, return_failed=False, cache_root=None):
@contextmanager
def setup_rest_cache(self, local=None, return_failed=False):
with temporary_dir() as artifact_root:
local = local or TempLocalArtifactCache(artifact_root, 0)
local = local or TempLocalArtifactCache(artifact_root, artifact_root, 0)
with self.setup_server(return_failed=return_failed) as server:
yield RESTfulArtifactCache(artifact_root, BestUrlSelector([server.url]), local)

Expand Down Expand Up @@ -89,6 +95,10 @@ def test_local_cache(self):
with self.setup_local_cache() as artifact_cache:
self.do_test_artifact_cache(artifact_cache)

def test_local_cache_with_seperate_extraction_root(self):
with self.setup_local_cache(seperate_extraction_root=True) as artifact_cache:
self.do_test_artifact_cache(artifact_cache)

@pytest.mark.flaky(retries=1) # https://github.com/pantsbuild/pants/issues/6838
def test_restful_cache(self):
with self.assertRaises(InvalidRESTfulCacheProtoError):
Expand All @@ -102,7 +112,7 @@ def test_restful_cache_failover(self):
bad_url = "http://badhost:123"

with temporary_dir() as artifact_root:
local = TempLocalArtifactCache(artifact_root, 0)
local = TempLocalArtifactCache(artifact_root, artifact_root, 0)

# With fail-over, rest call second time will succeed
with self.setup_server() as good_server:
Expand Down Expand Up @@ -134,7 +144,10 @@ def do_test_artifact_cache(self, artifact_cache):
self.assertTrue(bool(artifact_cache.use_cached_files(key)))

# Check that it was recovered correctly.
with open(path, "rb") as infile:
extracted_file_path = os.path.join(
artifact_cache.artifact_extraction_root, os.path.basename(path)
)
with open(extracted_file_path, "rb") as infile:
content = infile.read()
self.assertEqual(content, TEST_CONTENT1)

Expand All @@ -146,7 +159,7 @@ def test_local_backed_remote_cache(self):
"""make sure that the combined cache finds what it should and that it backfills."""
with self.setup_server() as server:
with self.setup_local_cache() as local:
tmp = TempLocalArtifactCache(local.artifact_root, 0)
tmp = TempLocalArtifactCache(local.artifact_root, local.artifact_extraction_root, 0)
remote = RESTfulArtifactCache(
local.artifact_root, BestUrlSelector([server.url]), tmp
)
Expand Down Expand Up @@ -194,7 +207,9 @@ def test_local_backed_remote_cache_corrupt_artifact(self):
with temporary_dir() as remote_cache_dir:
with self.setup_server(cache_root=remote_cache_dir) as server:
with self.setup_local_cache() as local:
tmp = TempLocalArtifactCache(local.artifact_root, compression=1)
tmp = TempLocalArtifactCache(
local.artifact_root, local.artifact_extraction_root, compression=1
)
remote = RESTfulArtifactCache(
local.artifact_root, BestUrlSelector([server.url]), tmp
)
Expand Down

0 comments on commit 86c22f7

Please sign in to comment.