Skip to content

Commit

Permalink
[7.1.0] Optimize RemoteActionFileSystem#resolveSymbolicLinks by cachi…
Browse files Browse the repository at this point in the history
…ng intermediate results in a trie.

When scanning a filesystem tree, resolveSymbolicLinks does O(M*N) work, where M is the number of components in a file path and N is the number of files. This CL makes it O(M+N) instead. This makes large output tree artifacts (~250k files) much more efficient to scan (from ~45s to ~9s in a particular example; additional optimizations are possible and will be made in a followup CL).

Related to bazelbuild#17009.

PiperOrigin-RevId: 606259673
Change-Id: Icf781a78b3271196e0029e3049d969a9e6073906
  • Loading branch information
tjgq committed Feb 13, 2024
1 parent 3b2e649 commit 2912882
Show file tree
Hide file tree
Showing 6 changed files with 566 additions and 24 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
// Copyright 2024 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.devtools.build.lib.remote;

import static com.google.common.base.Preconditions.checkArgument;

import com.google.devtools.build.lib.vfs.FileSymlinkLoopException;
import com.google.devtools.build.lib.vfs.FileSystem;
import com.google.devtools.build.lib.vfs.PathFragment;
import java.io.IOException;
import java.util.Iterator;
import java.util.concurrent.ConcurrentHashMap;
import javax.annotation.Nullable;

/**
* Canonicalizes paths like {@link FileSystem#resolveSymbolicLinks}, while storing the intermediate
* results in a trie so they can be reused by future canonicalizations.
*
* <p>This is an implementation detail of {@link RemoteActionFileSystem}, factored out for testing.
* Because {@link RemoteActionFileSystem} implements a union filesystem and must account for the
* possibility of symlinks straddling the underlying filesystems, the performance of large
* filesystem scans can be greatly improved with a custom {@link FileSystem#resolveSymbolicLinks}
* implementation that leverages the trie to avoid repeated work.
*
* <p>On case-insensitive filesystems, accessing the same path through different case variations
* will produce distinct trie entries. This could be fixed, but it's a performance rather than a
* correctness concern, and shouldn't matter most of the time.
*
* <p>Thread-safe: concurrent calls to {@link #resolveSymbolicLinks} are supported. As with {@link
* FileSystem#resolveSymbolicLinks}, the result is undefined if the filesystem is mutated
* concurrently.
*/
final class PathCanonicalizer {

interface Resolver {
/**
* Returns the result of {@link FileSystem#readSymbolicLink} if the path is a symlink, otherwise
* null. All but the last path segment must be canonical.
*
* @throws IOException if the file type or symlink target path could not be determined
*/
@Nullable
PathFragment resolveOneLink(PathFragment path) throws IOException;
}

/** The trie node for a path segment. */
private static final class Node {
/**
* Maps the next segment to the corresponding node. Null iff the path ending with the current
* segment is a symlink.
*/
@Nullable private final ConcurrentHashMap<String, Node> tab;

/** The symlink target. Null iff the path ending with the current segment is not a symlink. */
@Nullable private final PathFragment targetPath;

private Node(@Nullable PathFragment targetPath) {
this.tab = targetPath == null ? new ConcurrentHashMap<>() : null;
this.targetPath = targetPath;
}
}

private final Resolver resolver;
private final Node root = new Node(null);

PathCanonicalizer(Resolver resolver) {
this.resolver = resolver;
}

/** Returns the root node for an absolute path. */
private Node getRootNode(PathFragment path) {
checkArgument(path.isAbsolute());
// Unix has a single root. Windows has one root per drive.
return path.getDriveStrLength() > 1
? root.tab.computeIfAbsent(path.getDriveStr(), unused -> new Node(null))
: root;
}

/**
* Canonicalizes a path, reusing cached information if possible.
*
* @param path the path to canonicalize.
* @param maxLinks the maximum number of symlinks that can be followed in the process of
* canonicalizing the path.
* @throws FileSymlinkLoopException if too many symlinks had to be followed.
* @throws IOException if an I/O error occurs
* @return the canonical path.
*/
private PathFragment resolveSymbolicLinks(PathFragment path, int maxLinks) throws IOException {
// This code is carefully written to be as fast as possible when the path is already canonical
// and has been previously cached. Avoid making changes without benchmarking. A tree artifact
// with hundreds of thousands of files makes for a good benchmark.

Node node = getRootNode(path);
Iterable<String> segments = path.segments();
int segmentIndex = 0;

// Loop invariants:
// - `segmentIndex` is the index of the current `segment` relative to the start of `path`. The
// first segment has index 0.
// - `path` is the absolute path to canonicalize. If `segmentIndex` > 0, `path` is already
// canonical up to and including `segmentIndex` - 1.
// - `node` is the trie node corresponding to the `path` prefix ending with `segmentIndex` - 1,
// or to the root path when `segmentIndex` is 0.
for (String segment : segments) {
Node nextNode = node.tab.get(segment);
if (nextNode == null) {
PathFragment naivePath = path.subFragment(0, segmentIndex + 1);
PathFragment targetPath = resolver.resolveOneLink(naivePath);
nextNode = node.tab.computeIfAbsent(segment, unused -> new Node(targetPath));
}

if (nextNode.targetPath != null) {
if (maxLinks == 0) {
throw new FileSymlinkLoopException(path);
}
maxLinks--;

// Compute the path obtained by resolving the symlink.
// Note that path normalization already handles uplevel references.
PathFragment newPath;
if (nextNode.targetPath.isAbsolute()) {
newPath = nextNode.targetPath.getRelative(path.subFragment(segmentIndex + 1));
} else {
newPath =
path.subFragment(0, segmentIndex)
.getRelative(nextNode.targetPath)
.getRelative(path.subFragment(segmentIndex + 1));
}

// For absolute symlinks, we must start over.
// For relative symlinks, it would have been possible to restart after the already
// canonicalized prefix, but they're too rare to be worth optimizing for.
return resolveSymbolicLinks(newPath, maxLinks);
}

node = nextNode;
segmentIndex++;
}

return path;
}

/**
* Canonicalizes a path, reusing cached information if possible.
*
* <p>See {@link FileSystem#resolveSymbolicLinks} for the full specification.
*
* @param path the path to canonicalize.
* @throws FileSymlinkLoopException if too many symlinks had to be followed.
* @throws IOException if an I/O error occurs
* @return the canonical path.
*/
PathFragment resolveSymbolicLinks(PathFragment path) throws IOException {
return resolveSymbolicLinks(path, FileSystem.MAX_SYMLINKS);
}

/** Removes cached information for a path prefix. */
void clearPrefix(PathFragment pathPrefix) {
Node node = getRootNode(pathPrefix);
Iterator<String> segments = pathPrefix.segments().iterator();
boolean hasNext = segments.hasNext();

while (node != null && hasNext) {
String segment = segments.next();
hasNext = segments.hasNext();

if (!hasNext) {
// Found the path prefix.
node.tab.remove(segment);
return;
}

if (node.targetPath != null) {
// Path prefix not in trie.
return;
}

node = node.tab.get(segment);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
import javax.annotation.Nullable;

/**
* An action filesystem suitable for use when building with remote caching or execution.
* An action filesystem suitable for use when building with disk/remote caching or execution.
*
* <p>It acts as a union filesystem over three different sources:
*
Expand All @@ -91,17 +91,21 @@
* to an input. Most operations call resolveSymbolicLinks upfront (which is able to canonicalize
* paths taking every source into account) and only then delegate to the underlying sources.
*
* <p>The implementation assumes that an action never modifies its input files, but may otherwise
* modify any path in the output tree, including modifications that undo previous ones (e.g.,
* writing to a path and then moving it elsewhere). No effort is made to detect irreconcilable
* differences between sources (e.g., distinct files of the same name in two different sources).
* <p>The implementation assumes that an action never modifies its input paths, but may otherwise
* modify any path in the output tree. Concurrent operations are supported as long as they don't
* affect filesystem structure (i.e., create, move or delete paths). Otherwise, they might fail or
* produce inconsistent results. No effort is made to detect irreconcilable differences between
* sources, such as the same path existing in multiple underlying sources with different type or
* contents.
*/
public class RemoteActionFileSystem extends AbstractFileSystemWithCustomStat {
public class RemoteActionFileSystem extends AbstractFileSystemWithCustomStat
implements PathCanonicalizer.Resolver {
private final PathFragment execRoot;
private final PathFragment outputBase;
private final InputMetadataProvider fileCache;
private final ActionInputMap inputArtifactData;
private final TreeArtifactDirectoryCache inputTreeArtifactDirectoryCache;
private final PathCanonicalizer pathCanonicalizer;
private final ImmutableMap<PathFragment, Artifact> outputMapping;
private final RemoteActionInputFetcher inputFetcher;
private final FileSystem localFs;
Expand Down Expand Up @@ -241,6 +245,7 @@ public RemoteActionFileSystem(
this.outputBase = execRoot.getRelative(checkNotNull(relativeOutputPath, "relativeOutputPath"));
this.inputArtifactData = checkNotNull(inputArtifactData, "inputArtifactData");
this.inputTreeArtifactDirectoryCache = new TreeArtifactDirectoryCache();
this.pathCanonicalizer = new PathCanonicalizer(this);
this.outputMapping =
stream(outputArtifacts).collect(toImmutableMap(Artifact::getExecPath, a -> a));
this.fileCache = checkNotNull(fileCache, "fileCache");
Expand Down Expand Up @@ -310,6 +315,27 @@ public String getFileSystemType(PathFragment path) {
return "remoteActionFS";
}

@Override
protected Path resolveSymbolicLinks(PathFragment path) throws IOException {
return getPath(pathCanonicalizer.resolveSymbolicLinks(path));
}

@Override
@Nullable
public PathFragment resolveOneLink(PathFragment path) throws IOException {
// The base implementation attempts to readSymbolicLink first and falls back to stat, but that
// unnecessarily allocates a NotASymlinkException in the overwhelmingly likely non-symlink case.
// It's more efficient to stat unconditionally.
//
// The parent path has already been canonicalized, so FOLLOW_NONE is effectively the same as
// FOLLOW_PARENT, but much more efficient as it doesn't call stat recursively.
var stat = statInternal(path, FollowMode.FOLLOW_NONE, StatSources.ALL);
if (stat == null) {
throw new FileNotFoundException(path.getPathString() + " (No such file or directory)");
}
return stat.isSymbolicLink() ? readSymbolicLink(path) : null;
}

// Like resolveSymbolicLinks(), except that only the parent path is canonicalized.
private PathFragment resolveSymbolicLinksForParent(PathFragment path) throws IOException {
PathFragment parentPath = path.getParentDirectory();
Expand All @@ -328,10 +354,15 @@ protected boolean delete(PathFragment path) throws IOException {
return false;
}

// No action implementations call renameTo concurrently with other filesystem operations, so
// there's no risk of a race condition below.
pathCanonicalizer.clearPrefix(path);

boolean deleted = localFs.getPath(path).delete();
if (isOutput(path)) {
deleted = remoteOutputTree.getPath(path).delete() || deleted;
}

return deleted;
}

Expand Down Expand Up @@ -538,22 +569,6 @@ protected void createSymbolicLink(PathFragment linkPath, PathFragment targetFrag
localFs.getPath(linkPath).createSymbolicLink(targetFragment);
}

@Nullable
@Override
protected PathFragment resolveOneLink(PathFragment path) throws IOException {
// The base implementation attempts to readSymbolicLink first and falls back to stat, but that
// unnecessarily allocates a NotASymlinkException in the overwhelmingly likely non-symlink case.
// It's more efficient to stat unconditionally.
//
// The parent path has already been canonicalized, so FOLLOW_NONE is effectively the same as
// FOLLOW_PARENT, but much more efficient as it doesn't call stat recursively.
var stat = statInternal(path, FollowMode.FOLLOW_NONE, StatSources.ALL);
if (stat == null) {
throw new FileNotFoundException(path.getPathString() + " (No such file or directory)");
}
return stat.isSymbolicLink() ? readSymbolicLink(path) : null;
}

@Override
protected long getLastModifiedTime(PathFragment path, boolean followSymlinks) throws IOException {
FileStatus stat = stat(path, followSymlinks);
Expand Down Expand Up @@ -760,6 +775,11 @@ public void renameTo(PathFragment srcPath, PathFragment dstPath) throws IOExcept
checkArgument(isOutput(srcPath), "srcPath must be an output path");
checkArgument(isOutput(dstPath), "dstPath must be an output path");

// No action implementations call renameTo concurrently with other filesystem operations, so
// there's no risk of a race condition below.
pathCanonicalizer.clearPrefix(srcPath);
pathCanonicalizer.clearPrefix(dstPath);

FileNotFoundException remoteException = null;
try {
remoteOutputTree.renameTo(srcPath, dstPath);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@
@ThreadSafe
public abstract class FileSystem {

// The maximum number of symbolic links that may be traversed by resolveSymbolicLinks() while
// canonicalizing a path before it gives up and throws a FileSymlinkLoopException.
public static final int MAX_SYMLINKS = 32;

private final DigestHashFunction digestFunction;

public FileSystem(DigestHashFunction digestFunction) {
Expand Down Expand Up @@ -362,7 +366,7 @@ public InputStream openStream() throws IOException {
/**
* Appends a single regular path segment 'child' to 'dir', recursively resolving symbolic links in
* 'child'. 'dir' must be canonical. 'maxLinks' is the maximum number of symbolic links that may
* be traversed before it gives up (the Linux kernel uses 32).
* be traversed before it gives up.
*
* <p>(This method does not need to be synchronized; but the result may be stale in the case of
* concurrent modification.)
Expand Down Expand Up @@ -443,7 +447,8 @@ protected Path resolveSymbolicLinks(PathFragment path) throws IOException {
return parentNode == null
? getPath(path) // (root)
: getPath(
appendSegment(resolveSymbolicLinks(parentNode).asFragment(), path.getBaseName(), 32));
appendSegment(
resolveSymbolicLinks(parentNode).asFragment(), path.getBaseName(), MAX_SYMLINKS));
}

/**
Expand Down
1 change: 1 addition & 0 deletions src/test/java/com/google/devtools/build/lib/remote/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ java_library(
"//src/test/java/com/google/devtools/build/lib/testutil:TestUtils",
"//third_party:auth",
"//third_party:caffeine",
"//third_party:checker_framework_annotations",
"//third_party:error_prone_annotations",
"//third_party:gson",
"//third_party:guava",
Expand Down
Loading

0 comments on commit 2912882

Please sign in to comment.