Skip to content

Commit 85e0f97

Browse files
committed
HADOOP-16848. Experimental directory marker optimization.
Optimizes directory marker operations through:- * Async create/clear * An option "fs.s3a.experimental.optimized.directory.operations" That performs whatever optimisations to directory marker IO are considered effective to reduce that I while still avoiding problems with missing markers and/or markers not deleted. 1. Work in progress 2. Experimental: use at own risk 3. Not for use against buckets whether other S3A releases are active 4. And not for use with data you value. View it as a Proof of Concept to explore what we can do here. Change-Id: Ia0dadc5e513db00c650376769e37d2ef1fab12f5
1 parent 20add89 commit 85e0f97

File tree

2 files changed

+104
-18
lines changed

2 files changed

+104
-18
lines changed

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,20 @@ private Constants() {
210210
public static final boolean EXPERIMENTAL_AWS_INTERNAL_THROTTLING_DEFAULT =
211211
true;
212212

213+
/**
214+
* Experimental/Unstable feature: should empty directory marker
215+
* operations be optimized? Value {@value}.
216+
* Default: false.
217+
*
218+
* This is an experimental feature for reducing operations related
219+
* to looking for/deleting fake directory markers.
220+
* The goals are better performance as well as fewer tombstone markers
221+
* being created on versioned buckets.
222+
*/
223+
@InterfaceStability.Unstable
224+
public static final String EXPERIMENTAL_OPTIMIZED_DIRECTORY_OPERATIONS =
225+
"fs.s3a.experimental.optimized.directory.operations";
226+
213227
// seconds until we give up trying to establish a connection to s3
214228
public static final String ESTABLISH_TIMEOUT =
215229
"fs.s3a.connection.establish.timeout";

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java

Lines changed: 90 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -172,8 +172,10 @@
172172
import static org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.TokenIssuingPolicy.NoTokensAvailable;
173173
import static org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.hasDelegationTokenBinding;
174174
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.submit;
175+
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletion;
175176
import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletionIgnoringExceptions;
176177
import static org.apache.hadoop.fs.s3a.impl.InternalConstants.SC_404;
178+
import static org.apache.hadoop.fs.s3a.impl.InternalConstants.X_DIRECTORY;
177179
import static org.apache.hadoop.fs.s3a.impl.NetworkBinding.fixBucketRegion;
178180
import static org.apache.hadoop.io.IOUtils.cleanupWithLogger;
179181

@@ -287,6 +289,11 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
287289
private final S3AFileSystem.OperationCallbacksImpl
288290
operationCallbacks = new OperationCallbacksImpl();
289291

292+
/**
293+
* Should directory marker use be optimized?
294+
*/
295+
private boolean optimizeDirectoryOperations;
296+
290297
/** Add any deprecated keys. */
291298
@SuppressWarnings("deprecation")
292299
private static void addDeprecatedKeys() {
@@ -411,7 +418,11 @@ public void initialize(URI name, Configuration originalConf)
411418

412419
// instantiate S3 Select support
413420
selectBinding = new SelectBinding(writeHelper);
414-
421+
optimizeDirectoryOperations = conf.getBoolean(
422+
EXPERIMENTAL_OPTIMIZED_DIRECTORY_OPERATIONS, false);
423+
if (optimizeDirectoryOperations) {
424+
LOG.info("Using experimental optimized directory operations");
425+
}
415426
boolean blockUploadEnabled = conf.getBoolean(FAST_UPLOAD, true);
416427

417428
if (!blockUploadEnabled) {
@@ -1495,8 +1506,21 @@ public void finishRename(final Path sourceRenamed, final Path destCreated)
14951506
Path destParent = destCreated.getParent();
14961507
if (!sourceRenamed.getParent().equals(destParent)) {
14971508
LOG.debug("source & dest parents are different; fix up dir markers");
1498-
deleteUnnecessaryFakeDirectories(destParent);
1499-
maybeCreateFakeParentDirectory(sourceRenamed);
1509+
// kick off an async delete
1510+
List<CompletableFuture<Void>> ops = new ArrayList<>(2);
1511+
ops.add(submit(
1512+
unboundedThreadPool,
1513+
() -> {
1514+
deleteUnnecessaryFakeDirectories(destParent, false);
1515+
return null;
1516+
}));
1517+
ops.add(submit(
1518+
unboundedThreadPool,
1519+
() -> {
1520+
maybeCreateFakeParentDirectory(sourceRenamed);
1521+
return null;
1522+
}));
1523+
waitForCompletion(ops);
15001524
}
15011525
}
15021526

@@ -3564,7 +3588,7 @@ void finishedWrite(String key, long length, String eTag, String versionId,
35643588
final CompletableFuture<?> deletion = submit(
35653589
unboundedThreadPool,
35663590
() -> {
3567-
deleteUnnecessaryFakeDirectories(p.getParent());
3591+
deleteUnnecessaryFakeDirectories(p.getParent(), isDir);
35683592
return null;
35693593
});
35703594
// this is only set if there is a metastore to update and the
@@ -3629,18 +3653,50 @@ void finishedWrite(String key, long length, String eTag, String versionId,
36293653
* Delete mock parent directories which are no longer needed.
36303654
* Retry policy: retrying; exceptions swallowed.
36313655
* @param path path
3656+
* @param isMkDirOperation is this for a mkdir call?
36323657
*/
36333658
@Retries.RetryExceptionsSwallowed
3634-
private void deleteUnnecessaryFakeDirectories(Path path) {
3659+
private void deleteUnnecessaryFakeDirectories(Path path,
3660+
final boolean isMkDirOperation) {
36353661
List<DeleteObjectsRequest.KeyVersion> keysToRemove = new ArrayList<>();
3636-
while (!path.isRoot()) {
3637-
String key = pathToKey(path);
3638-
key = (key.endsWith("/")) ? key : (key + "/");
3639-
LOG.trace("To delete unnecessary fake directory {} for {}", key, path);
3640-
keysToRemove.add(new DeleteObjectsRequest.KeyVersion(key));
3641-
path = path.getParent();
3662+
boolean deleteWholeTree = false;
3663+
if (optimizeDirectoryOperations && !isMkDirOperation) {
3664+
// this is a file creation/commit
3665+
// Assume that the parent directory exists either explicitly as a marker
3666+
// on implicitly (peer entries)
3667+
// only look for the dir marker in S3 -we don't care about DDB.
3668+
try {
3669+
String key = pathToKey(path);
3670+
s3GetFileStatus(path, key, StatusProbeEnum.DIR_MARKER_ONLY, null);
3671+
// here an entry exists.
3672+
LOG.debug("Removing marker {}", key);
3673+
keysToRemove.add(new DeleteObjectsRequest.KeyVersion(key));
3674+
} catch (FileNotFoundException e) {
3675+
// no entry. Nothing to delete.
3676+
} catch (IOException e) {
3677+
instrumentation.errorIgnored();
3678+
LOG.debug("Ignored when looking at directory marker {}", path, e);
3679+
// for now, fall back to a full delete.
3680+
// if the failure was permissions or network this will probably fail
3681+
// too...
3682+
deleteWholeTree = true;
3683+
}
3684+
} else {
3685+
deleteWholeTree = true;
3686+
}
3687+
if (deleteWholeTree) {
3688+
// traditional delete creates a delete request for
3689+
// all parents.
3690+
while (!path.isRoot()) {
3691+
String key = pathToKey(path);
3692+
key = (key.endsWith("/")) ? key : (key + "/");
3693+
LOG.trace("To delete unnecessary fake directory {} for {}", key, path);
3694+
keysToRemove.add(new DeleteObjectsRequest.KeyVersion(key));
3695+
path = path.getParent();
3696+
}
36423697
}
36433698
try {
3699+
// TODO: when size ==1, use DELETE instead
36443700
removeKeys(keysToRemove, true, null);
36453701
} catch(AmazonClientException | IOException e) {
36463702
instrumentation.errorIgnored();
@@ -3686,8 +3742,10 @@ public int read() throws IOException {
36863742
}
36873743
};
36883744

3745+
final ObjectMetadata metadata = newObjectMetadata(0L);
3746+
metadata.setContentType(X_DIRECTORY);
36893747
PutObjectRequest putObjectRequest = newPutObjectRequest(objectName,
3690-
newObjectMetadata(0L),
3748+
metadata,
36913749
im);
36923750
invoker.retry("PUT 0-byte object ", objectName,
36933751
true,
@@ -3803,8 +3861,7 @@ public String toString() {
38033861
if (committerIntegration != null) {
38043862
sb.append(", magicCommitter=").append(isMagicCommitEnabled());
38053863
}
3806-
sb.append(", boundedExecutor=").append(boundedThreadPool);
3807-
sb.append(", unboundedExecutor=").append(unboundedThreadPool);
3864+
sb.append(", optimizeDirMarkers=").append(optimizeDirectoryOperations);
38083865
sb.append(", credentials=").append(credentials);
38093866
sb.append(", delegation tokens=")
38103867
.append(delegationTokens.map(Objects::toString).orElse("disabled"));
@@ -3902,25 +3959,40 @@ public boolean exists(Path f) throws IOException {
39023959
}
39033960

39043961
/**
3905-
* Override superclass so as to add statistic collection.
3962+
* An optimized check which only looks for directory markers.
39063963
* {@inheritDoc}
39073964
*/
39083965
@Override
39093966
@SuppressWarnings("deprecation")
39103967
public boolean isDirectory(Path f) throws IOException {
39113968
entryPoint(INVOCATION_IS_DIRECTORY);
3912-
return super.isDirectory(f);
3969+
try {
3970+
// against S3Guard, a full query;
3971+
// against S3 a HEAD + "/" then a LIST.
3972+
return innerGetFileStatus(f, false,
3973+
StatusProbeEnum.DIRECTORIES).isDirectory();
3974+
} catch (FileNotFoundException e) {
3975+
return false;
3976+
}
39133977
}
39143978

39153979
/**
3916-
* Override superclass so as to add statistic collection.
3980+
* Override superclass so as to only poll for a file.
3981+
* Warning: may leave a 404 in the S3 load balancer cache.
39173982
* {@inheritDoc}
39183983
*/
39193984
@Override
39203985
@SuppressWarnings("deprecation")
39213986
public boolean isFile(Path f) throws IOException {
39223987
entryPoint(INVOCATION_IS_FILE);
3923-
return super.isFile(f);
3988+
try {
3989+
// against S3Guard, a full query; against S3 only a HEAD.
3990+
return innerGetFileStatus(f, false,
3991+
StatusProbeEnum.HEAD_ONLY).isFile();
3992+
} catch (FileNotFoundException e) {
3993+
// no file or there is a directory there.
3994+
return false;
3995+
}
39243996
}
39253997

39263998
/**

0 commit comments

Comments
 (0)