Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[7.1.0] Optimize RemoteActionFileSystem#resolveSymbolicLinks by caching intermediate results in a trie. #21333

Merged
merged 1 commit into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
// Copyright 2024 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.devtools.build.lib.remote;

import static com.google.common.base.Preconditions.checkArgument;

import com.google.devtools.build.lib.vfs.FileSymlinkLoopException;
import com.google.devtools.build.lib.vfs.FileSystem;
import com.google.devtools.build.lib.vfs.PathFragment;
import java.io.IOException;
import java.util.Iterator;
import java.util.concurrent.ConcurrentHashMap;
import javax.annotation.Nullable;

/**
* Canonicalizes paths like {@link FileSystem#resolveSymbolicLinks}, while storing the intermediate
* results in a trie so they can be reused by future canonicalizations.
*
* <p>This is an implementation detail of {@link RemoteActionFileSystem}, factored out for testing.
* Because {@link RemoteActionFileSystem} implements a union filesystem and must account for the
* possibility of symlinks straddling the underlying filesystems, the performance of large
* filesystem scans can be greatly improved with a custom {@link FileSystem#resolveSymbolicLinks}
* implementation that leverages the trie to avoid repeated work.
*
* <p>On case-insensitive filesystems, accessing the same path through different case variations
* will produce distinct trie entries. This could be fixed, but it's a performance rather than a
* correctness concern, and shouldn't matter most of the time.
*
* <p>Thread-safe: concurrent calls to {@link #resolveSymbolicLinks} are supported. As with {@link
* FileSystem#resolveSymbolicLinks}, the result is undefined if the filesystem is mutated
* concurrently.
*/
final class PathCanonicalizer {

interface Resolver {
/**
* Returns the result of {@link FileSystem#readSymbolicLink} if the path is a symlink, otherwise
* null. All but the last path segment must be canonical.
*
* @throws IOException if the file type or symlink target path could not be determined
*/
@Nullable
PathFragment resolveOneLink(PathFragment path) throws IOException;
}

/** The trie node for a path segment. */
private static final class Node {
/**
* Maps the next segment to the corresponding node. Null iff the path ending with the current
* segment is a symlink.
*/
@Nullable private final ConcurrentHashMap<String, Node> tab;

/** The symlink target. Null iff the path ending with the current segment is not a symlink. */
@Nullable private final PathFragment targetPath;

private Node(@Nullable PathFragment targetPath) {
this.tab = targetPath == null ? new ConcurrentHashMap<>() : null;
this.targetPath = targetPath;
}
}

private final Resolver resolver;
private final Node root = new Node(null);

PathCanonicalizer(Resolver resolver) {
this.resolver = resolver;
}

/** Returns the root node for an absolute path. */
private Node getRootNode(PathFragment path) {
checkArgument(path.isAbsolute());
// Unix has a single root. Windows has one root per drive.
return path.getDriveStrLength() > 1
? root.tab.computeIfAbsent(path.getDriveStr(), unused -> new Node(null))
: root;
}

/**
* Canonicalizes a path, reusing cached information if possible.
*
* @param path the path to canonicalize.
* @param maxLinks the maximum number of symlinks that can be followed in the process of
* canonicalizing the path.
* @throws FileSymlinkLoopException if too many symlinks had to be followed.
* @throws IOException if an I/O error occurs
* @return the canonical path.
*/
private PathFragment resolveSymbolicLinks(PathFragment path, int maxLinks) throws IOException {
// This code is carefully written to be as fast as possible when the path is already canonical
// and has been previously cached. Avoid making changes without benchmarking. A tree artifact
// with hundreds of thousands of files makes for a good benchmark.

Node node = getRootNode(path);
Iterable<String> segments = path.segments();
int segmentIndex = 0;

// Loop invariants:
// - `segmentIndex` is the index of the current `segment` relative to the start of `path`. The
// first segment has index 0.
// - `path` is the absolute path to canonicalize. If `segmentIndex` > 0, `path` is already
// canonical up to and including `segmentIndex` - 1.
// - `node` is the trie node corresponding to the `path` prefix ending with `segmentIndex` - 1,
// or to the root path when `segmentIndex` is 0.
for (String segment : segments) {
Node nextNode = node.tab.get(segment);
if (nextNode == null) {
PathFragment naivePath = path.subFragment(0, segmentIndex + 1);
PathFragment targetPath = resolver.resolveOneLink(naivePath);
nextNode = node.tab.computeIfAbsent(segment, unused -> new Node(targetPath));
}

if (nextNode.targetPath != null) {
if (maxLinks == 0) {
throw new FileSymlinkLoopException(path);
}
maxLinks--;

// Compute the path obtained by resolving the symlink.
// Note that path normalization already handles uplevel references.
PathFragment newPath;
if (nextNode.targetPath.isAbsolute()) {
newPath = nextNode.targetPath.getRelative(path.subFragment(segmentIndex + 1));
} else {
newPath =
path.subFragment(0, segmentIndex)
.getRelative(nextNode.targetPath)
.getRelative(path.subFragment(segmentIndex + 1));
}

// For absolute symlinks, we must start over.
// For relative symlinks, it would have been possible to restart after the already
// canonicalized prefix, but they're too rare to be worth optimizing for.
return resolveSymbolicLinks(newPath, maxLinks);
}

node = nextNode;
segmentIndex++;
}

return path;
}

/**
* Canonicalizes a path, reusing cached information if possible.
*
* <p>See {@link FileSystem#resolveSymbolicLinks} for the full specification.
*
* @param path the path to canonicalize.
* @throws FileSymlinkLoopException if too many symlinks had to be followed.
* @throws IOException if an I/O error occurs
* @return the canonical path.
*/
PathFragment resolveSymbolicLinks(PathFragment path) throws IOException {
return resolveSymbolicLinks(path, FileSystem.MAX_SYMLINKS);
}

/** Removes cached information for a path prefix. */
void clearPrefix(PathFragment pathPrefix) {
Node node = getRootNode(pathPrefix);
Iterator<String> segments = pathPrefix.segments().iterator();
boolean hasNext = segments.hasNext();

while (node != null && hasNext) {
String segment = segments.next();
hasNext = segments.hasNext();

if (!hasNext) {
// Found the path prefix.
node.tab.remove(segment);
return;
}

if (node.targetPath != null) {
// Path prefix not in trie.
return;
}

node = node.tab.get(segment);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
import javax.annotation.Nullable;

/**
* An action filesystem suitable for use when building with remote caching or execution.
* An action filesystem suitable for use when building with disk/remote caching or execution.
*
* <p>It acts as a union filesystem over three different sources:
*
Expand All @@ -91,17 +91,21 @@
* to an input. Most operations call resolveSymbolicLinks upfront (which is able to canonicalize
* paths taking every source into account) and only then delegate to the underlying sources.
*
* <p>The implementation assumes that an action never modifies its input files, but may otherwise
* modify any path in the output tree, including modifications that undo previous ones (e.g.,
* writing to a path and then moving it elsewhere). No effort is made to detect irreconcilable
* differences between sources (e.g., distinct files of the same name in two different sources).
* <p>The implementation assumes that an action never modifies its input paths, but may otherwise
* modify any path in the output tree. Concurrent operations are supported as long as they don't
* affect filesystem structure (i.e., create, move or delete paths). Otherwise, they might fail or
* produce inconsistent results. No effort is made to detect irreconcilable differences between
* sources, such as the same path existing in multiple underlying sources with different type or
* contents.
*/
public class RemoteActionFileSystem extends AbstractFileSystemWithCustomStat {
public class RemoteActionFileSystem extends AbstractFileSystemWithCustomStat
implements PathCanonicalizer.Resolver {
private final PathFragment execRoot;
private final PathFragment outputBase;
private final InputMetadataProvider fileCache;
private final ActionInputMap inputArtifactData;
private final TreeArtifactDirectoryCache inputTreeArtifactDirectoryCache;
private final PathCanonicalizer pathCanonicalizer;
private final ImmutableMap<PathFragment, Artifact> outputMapping;
private final RemoteActionInputFetcher inputFetcher;
private final FileSystem localFs;
Expand Down Expand Up @@ -241,6 +245,7 @@ public RemoteActionFileSystem(
this.outputBase = execRoot.getRelative(checkNotNull(relativeOutputPath, "relativeOutputPath"));
this.inputArtifactData = checkNotNull(inputArtifactData, "inputArtifactData");
this.inputTreeArtifactDirectoryCache = new TreeArtifactDirectoryCache();
this.pathCanonicalizer = new PathCanonicalizer(this);
this.outputMapping =
stream(outputArtifacts).collect(toImmutableMap(Artifact::getExecPath, a -> a));
this.fileCache = checkNotNull(fileCache, "fileCache");
Expand Down Expand Up @@ -310,6 +315,27 @@ public String getFileSystemType(PathFragment path) {
return "remoteActionFS";
}

@Override
protected Path resolveSymbolicLinks(PathFragment path) throws IOException {
return getPath(pathCanonicalizer.resolveSymbolicLinks(path));
}

@Override
@Nullable
public PathFragment resolveOneLink(PathFragment path) throws IOException {
// The base implementation attempts to readSymbolicLink first and falls back to stat, but that
// unnecessarily allocates a NotASymlinkException in the overwhelmingly likely non-symlink case.
// It's more efficient to stat unconditionally.
//
// The parent path has already been canonicalized, so FOLLOW_NONE is effectively the same as
// FOLLOW_PARENT, but much more efficient as it doesn't call stat recursively.
var stat = statInternal(path, FollowMode.FOLLOW_NONE, StatSources.ALL);
if (stat == null) {
throw new FileNotFoundException(path.getPathString() + " (No such file or directory)");
}
return stat.isSymbolicLink() ? readSymbolicLink(path) : null;
}

// Like resolveSymbolicLinks(), except that only the parent path is canonicalized.
private PathFragment resolveSymbolicLinksForParent(PathFragment path) throws IOException {
PathFragment parentPath = path.getParentDirectory();
Expand All @@ -328,10 +354,15 @@ protected boolean delete(PathFragment path) throws IOException {
return false;
}

// No action implementations call renameTo concurrently with other filesystem operations, so
// there's no risk of a race condition below.
pathCanonicalizer.clearPrefix(path);

boolean deleted = localFs.getPath(path).delete();
if (isOutput(path)) {
deleted = remoteOutputTree.getPath(path).delete() || deleted;
}

return deleted;
}

Expand Down Expand Up @@ -538,22 +569,6 @@ protected void createSymbolicLink(PathFragment linkPath, PathFragment targetFrag
localFs.getPath(linkPath).createSymbolicLink(targetFragment);
}

@Nullable
@Override
protected PathFragment resolveOneLink(PathFragment path) throws IOException {
// The base implementation attempts to readSymbolicLink first and falls back to stat, but that
// unnecessarily allocates a NotASymlinkException in the overwhelmingly likely non-symlink case.
// It's more efficient to stat unconditionally.
//
// The parent path has already been canonicalized, so FOLLOW_NONE is effectively the same as
// FOLLOW_PARENT, but much more efficient as it doesn't call stat recursively.
var stat = statInternal(path, FollowMode.FOLLOW_NONE, StatSources.ALL);
if (stat == null) {
throw new FileNotFoundException(path.getPathString() + " (No such file or directory)");
}
return stat.isSymbolicLink() ? readSymbolicLink(path) : null;
}

@Override
protected long getLastModifiedTime(PathFragment path, boolean followSymlinks) throws IOException {
FileStatus stat = stat(path, followSymlinks);
Expand Down Expand Up @@ -760,6 +775,11 @@ public void renameTo(PathFragment srcPath, PathFragment dstPath) throws IOExcept
checkArgument(isOutput(srcPath), "srcPath must be an output path");
checkArgument(isOutput(dstPath), "dstPath must be an output path");

// No action implementations call renameTo concurrently with other filesystem operations, so
// there's no risk of a race condition below.
pathCanonicalizer.clearPrefix(srcPath);
pathCanonicalizer.clearPrefix(dstPath);

FileNotFoundException remoteException = null;
try {
remoteOutputTree.renameTo(srcPath, dstPath);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@
@ThreadSafe
public abstract class FileSystem {

// The maximum number of symbolic links that may be traversed by resolveSymbolicLinks() while
// canonicalizing a path before it gives up and throws a FileSymlinkLoopException.
public static final int MAX_SYMLINKS = 32;

private final DigestHashFunction digestFunction;

public FileSystem(DigestHashFunction digestFunction) {
Expand Down Expand Up @@ -362,7 +366,7 @@ public InputStream openStream() throws IOException {
/**
* Appends a single regular path segment 'child' to 'dir', recursively resolving symbolic links in
* 'child'. 'dir' must be canonical. 'maxLinks' is the maximum number of symbolic links that may
* be traversed before it gives up (the Linux kernel uses 32).
* be traversed before it gives up.
*
* <p>(This method does not need to be synchronized; but the result may be stale in the case of
* concurrent modification.)
Expand Down Expand Up @@ -443,7 +447,8 @@ protected Path resolveSymbolicLinks(PathFragment path) throws IOException {
return parentNode == null
? getPath(path) // (root)
: getPath(
appendSegment(resolveSymbolicLinks(parentNode).asFragment(), path.getBaseName(), 32));
appendSegment(
resolveSymbolicLinks(parentNode).asFragment(), path.getBaseName(), MAX_SYMLINKS));
}

/**
Expand Down
1 change: 1 addition & 0 deletions src/test/java/com/google/devtools/build/lib/remote/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ java_library(
"//src/test/java/com/google/devtools/build/lib/testutil:TestUtils",
"//third_party:auth",
"//third_party:caffeine",
"//third_party:checker_framework_annotations",
"//third_party:error_prone_annotations",
"//third_party:gson",
"//third_party:guava",
Expand Down
Loading
Loading