diff --git a/LICENSE-binary b/LICENSE-binary index ddce4209cc502..8e2c57b1032bd 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -216,7 +216,6 @@ com.aliyun:aliyun-java-sdk-kms:2.11.0 com.aliyun:aliyun-java-sdk-ram:3.1.0 com.aliyun:aliyun-java-sdk-sts:3.0.0 com.aliyun.oss:aliyun-sdk-oss:3.13.2 -com.amazonaws:aws-java-sdk-bundle:1.12.599 com.cedarsoftware:java-util:1.9.0 com.cedarsoftware:json-io:2.5.1 com.fasterxml.jackson.core:jackson-annotations:2.12.7 diff --git a/dev-support/bin/yetus-wrapper b/dev-support/bin/yetus-wrapper index 77cdc50a4733e..6ec28e6fede71 100755 --- a/dev-support/bin/yetus-wrapper +++ b/dev-support/bin/yetus-wrapper @@ -77,7 +77,7 @@ WANTED="$1" shift ARGV=("$@") -HADOOP_YETUS_VERSION=${HADOOP_YETUS_VERSION:-0.14.0} +HADOOP_YETUS_VERSION=${HADOOP_YETUS_VERSION:-0.14.1} BIN=$(yetus_abs "${BASH_SOURCE-$0}") BINDIR=$(dirname "${BIN}") @@ -123,7 +123,7 @@ fi ## need to DL, etc ## -BASEURL="https://archive.apache.org/dist/yetus/${HADOOP_YETUS_VERSION}/" +BASEURL="https://downloads.apache.org/yetus/${HADOOP_YETUS_VERSION}/" TARBALL="${YETUS_PREFIX}-${HADOOP_YETUS_VERSION}-bin.tar" GPGBIN=$(command -v gpg) diff --git a/dev-support/docker/Dockerfile_windows_10 b/dev-support/docker/Dockerfile_windows_10 index cde224d8a49b2..b414ad1b0d84c 100644 --- a/dev-support/docker/Dockerfile_windows_10 +++ b/dev-support/docker/Dockerfile_windows_10 @@ -61,8 +61,8 @@ RUN powershell Invoke-WebRequest -URI https://cdn.azul.com/zulu/bin/zulu8.62.0.1 RUN powershell Expand-Archive -Path $Env:TEMP\zulu8.62.0.19-ca-jdk8.0.332-win_x64.zip -DestinationPath "C:\Java" # Install Apache Maven. -RUN powershell Invoke-WebRequest -URI https://archive.apache.org/dist/maven/maven-3/3.8.6/binaries/apache-maven-3.8.6-bin.zip -OutFile $Env:TEMP\apache-maven-3.8.6-bin.zip -RUN powershell Expand-Archive -Path $Env:TEMP\apache-maven-3.8.6-bin.zip -DestinationPath "C:\Maven" +RUN powershell Invoke-WebRequest -URI https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.zip -OutFile $Env:TEMP\apache-maven-3.8.8-bin.zip +RUN powershell Expand-Archive -Path $Env:TEMP\apache-maven-3.8.8-bin.zip -DestinationPath "C:\Maven" # Install CMake 3.19.0. RUN powershell Invoke-WebRequest -URI https://cmake.org/files/v3.19/cmake-3.19.0-win64-x64.zip -OutFile $Env:TEMP\cmake-3.19.0-win64-x64.zip @@ -135,7 +135,7 @@ ENV MAVEN_OPTS '-Xmx2048M -Xss128M' ENV IS_WINDOWS 1 RUN setx PATH "%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" RUN setx PATH "%PATH%;%JAVA_HOME%\bin" -RUN setx PATH "%PATH%;C:\Maven\apache-maven-3.8.6\bin" +RUN setx PATH "%PATH%;C:\Maven\apache-maven-3.8.8\bin" RUN setx PATH "%PATH%;C:\CMake\cmake-3.19.0-win64-x64\bin" RUN setx PATH "%PATH%;C:\ZStd" RUN setx PATH "%PATH%;C:\Program Files\Git\usr\bin" diff --git a/hadoop-common-project/hadoop-auth/pom.xml b/hadoop-common-project/hadoop-auth/pom.xml index 34827579ccfa1..be90bb40b19a5 100644 --- a/hadoop-common-project/hadoop-auth/pom.xml +++ b/hadoop-common-project/hadoop-auth/pom.xml @@ -136,7 +136,11 @@ org.apache.kerby - kerb-simplekdc + kerb-core + + + org.apache.kerby + kerb-util org.apache.directory.server diff --git a/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh b/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh index 45fba7232a099..919589cc9b289 100755 --- a/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh +++ b/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh @@ -1911,7 +1911,7 @@ function hadoop_start_secure_daemon if [[ ! 
-f "${jsvc}" ]]; then hadoop_error "JSVC_HOME is not set or set incorrectly. jsvc is required to run secure" hadoop_error "or privileged daemons. Please download and install jsvc from " - hadoop_error "http://archive.apache.org/dist/commons/daemon/binaries/ " + hadoop_error "https://downloads.apache.org/commons/daemon/binaries/ " hadoop_error "and set JSVC_HOME to the directory containing the jsvc binary." exit 1 fi diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoUtils.java new file mode 100644 index 0000000000000..3d118457df7cd --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoUtils.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.crypto; + +import java.security.Provider; +import java.security.Security; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeysPublic; +import org.apache.hadoop.fs.store.LogExactlyOnce; + +/** Utility methods for the crypto related features. */ +@InterfaceAudience.Private +public final class CryptoUtils { + static final Logger LOG = LoggerFactory.getLogger(CryptoUtils.class); + private static final LogExactlyOnce LOG_FAILED_TO_LOAD_CLASS = new LogExactlyOnce(LOG); + private static final LogExactlyOnce LOG_FAILED_TO_ADD_PROVIDER = new LogExactlyOnce(LOG); + + private static final String BOUNCY_CASTLE_PROVIDER_CLASS + = "org.bouncycastle.jce.provider.BouncyCastleProvider"; + static final String BOUNCY_CASTLE_PROVIDER_NAME = "BC"; + + /** + * Get the security provider value specified in + * {@link CommonConfigurationKeysPublic#HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY} + * from the given conf. + * + * @param conf the configuration + * @return the configured provider, if there is any; otherwise, return an empty string. + */ + public static String getJceProvider(Configuration conf) { + final String provider = conf.getTrimmed( + CommonConfigurationKeysPublic.HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY, ""); + final boolean autoAdd = conf.getBoolean( + CommonConfigurationKeysPublic.HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_AUTO_ADD_KEY, + CommonConfigurationKeysPublic.HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_AUTO_ADD_DEFAULT); + + // For backward compatible, auto-add BOUNCY_CASTLE_PROVIDER_CLASS when the provider is "BC". + if (autoAdd && BOUNCY_CASTLE_PROVIDER_NAME.equals(provider)) { + try { + // Use reflection in order to avoid statically loading the class. 
+ final Class clazz = Class.forName(BOUNCY_CASTLE_PROVIDER_CLASS); + Security.addProvider((Provider) clazz.getConstructor().newInstance()); + LOG.debug("Successfully added security provider {}", provider); + if (LOG.isTraceEnabled()) { + LOG.trace("Trace", new Throwable()); + } + } catch (ClassNotFoundException e) { + LOG_FAILED_TO_LOAD_CLASS.warn("Failed to load " + BOUNCY_CASTLE_PROVIDER_CLASS, e); + } catch (Exception e) { + LOG_FAILED_TO_ADD_PROVIDER.warn("Failed to add security provider for {}", provider, e); + } + } + return provider; + } + + private CryptoUtils() { } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/JceCtrCryptoCodec.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/JceCtrCryptoCodec.java index 7aae65d47ccd2..8d15a8262e9b7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/JceCtrCryptoCodec.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/JceCtrCryptoCodec.java @@ -17,7 +17,6 @@ */ package org.apache.hadoop.crypto; -import org.bouncycastle.jce.provider.BouncyCastleProvider; import org.apache.hadoop.util.Preconditions; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; @@ -27,13 +26,11 @@ import java.nio.ByteBuffer; import java.security.GeneralSecurityException; import java.security.SecureRandom; -import java.security.Security; import javax.crypto.Cipher; import javax.crypto.spec.IvParameterSpec; import javax.crypto.spec.SecretKeySpec; import org.slf4j.Logger; -import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_JAVA_SECURE_RANDOM_ALGORITHM_KEY; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_JAVA_SECURE_RANDOM_ALGORITHM_DEFAULT; @@ -48,10 +45,6 @@ public String getProvider() { return provider; } - public void setProvider(String provider) { - this.provider = provider; - } - public void calculateIV(byte[] initIV, long counter, byte[] iv, int blockSize) { Preconditions.checkArgument(initIV.length == blockSize); @@ -82,17 +75,15 @@ public Configuration getConf() { public void setConf(Configuration conf) { this.conf = conf; - setProvider(conf.get(HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY)); - if (BouncyCastleProvider.PROVIDER_NAME.equals(provider)) { - Security.addProvider(new BouncyCastleProvider()); - } + this.provider = CryptoUtils.getJceProvider(conf); + final String secureRandomAlg = conf.get( HADOOP_SECURITY_JAVA_SECURE_RANDOM_ALGORITHM_KEY, HADOOP_SECURITY_JAVA_SECURE_RANDOM_ALGORITHM_DEFAULT); try { - random = (provider != null) + random = (provider != null && !provider.isEmpty()) ? 
SecureRandom.getInstance(secureRandomAlg, provider) : SecureRandom.getInstance(secureRandomAlg); } catch(GeneralSecurityException e) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java index 5e207251805fc..5cdd2f8d53f16 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java @@ -26,7 +26,6 @@ import java.io.OutputStreamWriter; import java.nio.charset.StandardCharsets; import java.security.NoSuchAlgorithmException; -import java.security.Security; import java.util.Arrays; import java.util.Collections; import java.util.Date; @@ -35,17 +34,16 @@ import java.util.Map; import java.util.Objects; -import org.bouncycastle.jce.provider.BouncyCastleProvider; import com.google.gson.stream.JsonReader; import com.google.gson.stream.JsonWriter; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.crypto.CryptoUtils; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import javax.crypto.KeyGenerator; -import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CRYPTO_JCEKS_KEY_SERIALFILTER; /** @@ -410,10 +408,7 @@ public KeyProvider(Configuration conf) { JCEKS_KEY_SERIALFILTER_DEFAULT); System.setProperty(JCEKS_KEY_SERIAL_FILTER, serialFilter); } - String jceProvider = conf.get(HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY); - if (BouncyCastleProvider.PROVIDER_NAME.equals(jceProvider)) { - Security.addProvider(new BouncyCastleProvider()); - } + CryptoUtils.getJceProvider(conf); } /** diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/package-info.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/package-info.java new file mode 100644 index 0000000000000..fe947dc263020 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/package-info.java @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Crypto related classes. 
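Annotation: the JceCtrCryptoCodec and KeyProvider hunks above replace the direct BouncyCastleProvider registration with a call to the new CryptoUtils.getJceProvider(conf). A minimal sketch of the intended call pattern, assuming hadoop-common and bcprov are on the classpath and using only the keys added in this patch; the class name below is illustrative, not part of the change:

    import java.security.Security;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.crypto.CryptoUtils;
    import org.apache.hadoop.fs.CommonConfigurationKeysPublic;

    public class JceProviderExample {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Ask for the BouncyCastle provider by its short name. With
        // hadoop.security.crypto.jce.provider.auto-add left at its default (true),
        // getJceProvider registers the provider reflectively as a side effect.
        conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY, "BC");
        String provider = CryptoUtils.getJceProvider(conf);   // returns "BC"
        System.out.println("configured provider: " + provider);
        System.out.println("registered with JCE: " + (Security.getProvider("BC") != null));
      }
    }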
*/ +package org.apache.hadoop.crypto; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java index 24a3167b3db2d..d01ddd30f4705 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java @@ -773,6 +773,9 @@ public class CommonConfigurationKeysPublic { */ public static final String HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY = "hadoop.security.crypto.jce.provider"; + public static final String HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_AUTO_ADD_KEY = + "hadoop.security.crypto.jce.provider.auto-add"; + public static final boolean HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_AUTO_ADD_DEFAULT = true; /** * @see * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CodecPool.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CodecPool.java index 5b1826f9e30a8..283dcd622ffe2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CodecPool.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CodecPool.java @@ -25,6 +25,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.ReflectionUtils; @@ -152,6 +153,9 @@ public static Compressor getCompressor(CompressionCodec codec, Configuration con compressor = codec.createCompressor(); LOG.info("Got brand-new compressor ["+codec.getDefaultExtension()+"]"); } else { + if (conf == null && codec instanceof Configurable) { + conf = ((Configurable)codec).getConf(); + } compressor.reinit(conf); if(LOG.isDebugEnabled()) { LOG.debug("Got recycled compressor"); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslConstants.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslConstants.java new file mode 100644 index 0000000000000..71e4b44873820 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslConstants.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.security; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * SASL related constants. 
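Annotation: the CodecPool hunk above makes a recycled compressor fall back to the codec's own Configuration when the caller passes conf == null, so codec-level settings such as the zlib compression level survive pooling. A small usage sketch mirroring the new TestCodecPool case further down; the API calls are taken from that test, nothing new is assumed:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.compress.CodecPool;
    import org.apache.hadoop.io.compress.Compressor;
    import org.apache.hadoop.io.compress.DefaultCodec;
    import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionLevel;
    import org.apache.hadoop.io.compress.zlib.ZlibFactory;

    public class CodecPoolConfExample {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        ZlibFactory.setCompressionLevel(conf, CompressionLevel.TWO);
        DefaultCodec codec = new DefaultCodec();
        codec.setConf(conf);

        // First call creates a compressor; returning it places it in the pool.
        Compressor compressor = CodecPool.getCompressor(codec);
        CodecPool.returnCompressor(compressor);

        // Second call hands back the pooled instance; with this patch it is
        // re-initialized from codec.getConf() instead of a null Configuration.
        Compressor recycled = CodecPool.getCompressor(codec);
        CodecPool.returnCompressor(recycled);
      }
    }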
+ */ +@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) +@InterfaceStability.Evolving +public class SaslConstants { + public static final Logger LOG = LoggerFactory.getLogger(SaslConstants.class); + + private static final String SASL_MECHANISM_ENV = "HADOOP_SASL_MECHANISM"; + public static final String SASL_MECHANISM; + private static final String SASL_MECHANISM_DEFAULT = "DIGEST-MD5"; + + static { + final String mechanism = System.getenv(SASL_MECHANISM_ENV); + LOG.debug("{} = {} (env)", SASL_MECHANISM_ENV, mechanism); + SASL_MECHANISM = mechanism != null? mechanism : SASL_MECHANISM_DEFAULT; + LOG.debug("{} = {} (effective)", SASL_MECHANISM_ENV, SASL_MECHANISM); + } + + private SaslConstants() {} +} \ No newline at end of file diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcServer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcServer.java index b61b6cc18414d..8b81a472db93a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcServer.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcServer.java @@ -223,8 +223,8 @@ public enum AuthMethod { SIMPLE((byte) 80, ""), KERBEROS((byte) 81, "GSSAPI"), @Deprecated - DIGEST((byte) 82, "DIGEST-MD5"), - TOKEN((byte) 82, "DIGEST-MD5"), + DIGEST((byte) 82, SaslConstants.SASL_MECHANISM), + TOKEN((byte) 82, SaslConstants.SASL_MECHANISM), PLAIN((byte) 83, "PLAIN"); /** The code for this method. */ @@ -273,7 +273,7 @@ public void write(DataOutput out) throws IOException { } }; - /** CallbackHandler for SASL DIGEST-MD5 mechanism */ + /** CallbackHandler for SASL mechanism. */ @InterfaceStability.Evolving public static class SaslDigestCallbackHandler implements CallbackHandler { private SecretManager secretManager; @@ -309,7 +309,7 @@ public void handle(Callback[] callbacks) throws InvalidToken, continue; // realm is ignored } else { throw new UnsupportedCallbackException(callback, - "Unrecognized SASL DIGEST-MD5 Callback"); + "Unrecognized SASL Callback"); } } if (pc != null) { @@ -319,11 +319,8 @@ public void handle(Callback[] callbacks) throws InvalidToken, UserGroupInformation user = null; user = tokenIdentifier.getUser(); // may throw exception connection.attemptingUser = user; - - if (LOG.isDebugEnabled()) { - LOG.debug("SASL server DIGEST-MD5 callback: setting password " - + "for client: " + tokenIdentifier.getUser()); - } + + LOG.debug("SASL server callback: setting password for client: {}", user); pc.setPassword(password); } if (ac != null) { @@ -339,8 +336,7 @@ public void handle(Callback[] callbacks) throws InvalidToken, UserGroupInformation logUser = getIdentifier(authzid, secretManager).getUser(); String username = logUser == null ? 
null : logUser.getUserName(); - LOG.debug("SASL server DIGEST-MD5 callback: setting " - + "canonicalized client ID: " + username); + LOG.debug("SASL server callback: setting authorizedID: {}", username); } ac.setAuthorizedID(authzid); } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java index 07be1f8e54e55..305e5e10af305 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java @@ -88,6 +88,7 @@ import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.TokenIdentifier; import org.apache.hadoop.util.Shell; +import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Time; import org.slf4j.Logger; @@ -1934,10 +1935,7 @@ protected Subject getSubject() { @InterfaceAudience.Public @InterfaceStability.Evolving public T doAs(PrivilegedAction action) { - if (LOG.isDebugEnabled()) { - LOG.debug("PrivilegedAction [as: {}][action: {}]", this, action, - new Exception()); - } + tracePrivilegedAction(action); return Subject.doAs(subject, action); } @@ -1957,10 +1955,7 @@ public T doAs(PrivilegedAction action) { public T doAs(PrivilegedExceptionAction action ) throws IOException, InterruptedException { try { - if (LOG.isDebugEnabled()) { - LOG.debug("PrivilegedAction [as: {}][action: {}]", this, action, - new Exception()); - } + tracePrivilegedAction(action); return Subject.doAs(subject, action); } catch (PrivilegedActionException pae) { Throwable cause = pae.getCause(); @@ -1982,6 +1977,14 @@ public T doAs(PrivilegedExceptionAction action } } + private void tracePrivilegedAction(Object action) { + if (LOG.isTraceEnabled()) { + // would be nice if action included a descriptive toString() + LOG.trace("PrivilegedAction [as: {}][action: {}][from: {}]", this, action, + StringUtils.getStackTrace(new Throwable())); + } + } + /** * Log current UGI and token information into specified log. 
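Annotation: the SaslRpcServer hunks above derive the TOKEN/DIGEST mechanism from SaslConstants.SASL_MECHANISM (introduced earlier in this patch), which defaults to DIGEST-MD5 and can be overridden through the HADOOP_SASL_MECHANISM environment variable. A hedged sketch for checking that the selected mechanism is actually registered with the local JDK; this check is illustrative and not part of the patch:

    import java.util.Enumeration;
    import javax.security.sasl.Sasl;
    import javax.security.sasl.SaslServerFactory;

    import org.apache.hadoop.security.SaslConstants;

    public class SaslMechanismCheck {
      public static void main(String[] args) {
        // "DIGEST-MD5" unless HADOOP_SASL_MECHANISM is set in the environment.
        String mechanism = SaslConstants.SASL_MECHANISM;
        boolean available = false;
        Enumeration<SaslServerFactory> factories = Sasl.getSaslServerFactories();
        while (factories.hasMoreElements()) {
          for (String name : factories.nextElement().getMechanismNames(null)) {
            available |= name.equals(mechanism);
          }
        }
        System.out.println(mechanism + (available ? " is" : " is NOT") + " available locally");
      }
    }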
* @param ugi - UGI diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java index 8378a47ceddfe..9cf3ccdd445e7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java @@ -33,6 +33,7 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.locks.ReentrantReadWriteLock; import javax.crypto.SecretKey; @@ -120,7 +121,7 @@ private String formatTokenId(TokenIdent id) { /** * Access to currentKey is protected by this object lock */ - private volatile DelegationKey currentKey; + private DelegationKey currentKey; private final long keyUpdateInterval; private final long tokenMaxLifetime; @@ -140,6 +141,8 @@ private String formatTokenId(TokenIdent id) { */ protected Object noInterruptsLock = new Object(); + private final ReentrantReadWriteLock apiLock = new ReentrantReadWriteLock(true); + /** * Create a secret manager * @param delegationKeyUpdateInterval the number of milliseconds for rolling @@ -169,21 +172,29 @@ public AbstractDelegationTokenSecretManager(long delegationKeyUpdateInterval, public void startThreads() throws IOException { Preconditions.checkState(!running); updateCurrentKey(); - synchronized (this) { + this.apiLock.writeLock().lock(); + try { running = true; tokenRemoverThread = new Daemon(new ExpiredTokenRemover()); tokenRemoverThread.start(); + } finally { + this.apiLock.writeLock().unlock(); } } /** * Reset all data structures and mutable state. */ - public synchronized void reset() { - setCurrentKeyId(0); - allKeys.clear(); - setDelegationTokenSeqNum(0); - currentTokens.clear(); + public void reset() { + this.apiLock.writeLock().lock(); + try { + setCurrentKeyId(0); + allKeys.clear(); + setDelegationTokenSeqNum(0); + currentTokens.clear(); + } finally { + this.apiLock.writeLock().unlock(); + } } /** @@ -210,17 +221,27 @@ protected long getTokenRenewInterval() { * @param key delegation key. * @throws IOException raised on errors performing I/O. */ - public synchronized void addKey(DelegationKey key) throws IOException { + public void addKey(DelegationKey key) throws IOException { if (running) // a safety check throw new IOException("Can't add delegation key to a running SecretManager."); - if (key.getKeyId() > getCurrentKeyId()) { - setCurrentKeyId(key.getKeyId()); + this.apiLock.writeLock().lock(); + try { + if (key.getKeyId() > getCurrentKeyId()) { + setCurrentKeyId(key.getKeyId()); + } + allKeys.put(key.getKeyId(), key); + } finally { + this.apiLock.writeLock().unlock(); } - allKeys.put(key.getKeyId(), key); } - public synchronized DelegationKey[] getAllKeys() { - return allKeys.values().toArray(new DelegationKey[0]); + public DelegationKey[] getAllKeys() { + this.apiLock.readLock().lock(); + try { + return allKeys.values().toArray(new DelegationKey[0]); + } finally { + this.apiLock.readLock().unlock(); + } } // HDFS @@ -263,8 +284,13 @@ protected void updateStoredToken(TokenIdent ident, long renewDate) throws IOExce * * @return currentId. 
*/ - protected synchronized int getCurrentKeyId() { - return currentId; + protected int getCurrentKeyId() { + this.apiLock.readLock().lock(); + try { + return currentId; + } finally { + this.apiLock.readLock().unlock(); + } } /** @@ -273,8 +299,13 @@ protected synchronized int getCurrentKeyId() { * * @return currentId. */ - protected synchronized int incrementCurrentKeyId() { - return ++currentId; + protected int incrementCurrentKeyId() { + this.apiLock.writeLock().lock(); + try { + return ++currentId; + } finally { + this.apiLock.writeLock().unlock(); + } } /** @@ -283,8 +314,13 @@ protected synchronized int incrementCurrentKeyId() { * * @param keyId keyId. */ - protected synchronized void setCurrentKeyId(int keyId) { - currentId = keyId; + protected void setCurrentKeyId(int keyId) { + this.apiLock.writeLock().lock(); + try { + currentId = keyId; + } finally { + this.apiLock.writeLock().unlock(); + } } /** @@ -293,8 +329,13 @@ protected synchronized void setCurrentKeyId(int keyId) { * * @return delegationTokenSequenceNumber. */ - protected synchronized int getDelegationTokenSeqNum() { - return delegationTokenSequenceNumber; + protected int getDelegationTokenSeqNum() { + this.apiLock.readLock().lock(); + try { + return delegationTokenSequenceNumber; + } finally { + this.apiLock.readLock().unlock(); + } } /** @@ -303,8 +344,13 @@ protected synchronized int getDelegationTokenSeqNum() { * * @return delegationTokenSequenceNumber. */ - protected synchronized int incrementDelegationTokenSeqNum() { - return ++delegationTokenSequenceNumber; + protected int incrementDelegationTokenSeqNum() { + this.apiLock.writeLock().lock(); + try { + return ++delegationTokenSequenceNumber; + } finally { + this.apiLock.writeLock().unlock(); + } } /** @@ -313,8 +359,13 @@ protected synchronized int incrementDelegationTokenSeqNum() { * * @param seqNum seqNum. */ - protected synchronized void setDelegationTokenSeqNum(int seqNum) { - delegationTokenSequenceNumber = seqNum; + protected void setDelegationTokenSeqNum(int seqNum) { + this.apiLock.writeLock().lock(); + try { + delegationTokenSequenceNumber = seqNum; + } finally { + this.apiLock.writeLock().unlock(); + } } /** @@ -401,34 +452,39 @@ protected void updateToken(TokenIdent ident, * @param renewDate token renew time * @throws IOException raised on errors performing I/O. 
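Annotation: the hunks above and below replace method-level synchronized in AbstractDelegationTokenSecretManager with a fair ReentrantReadWriteLock (apiLock), so concurrent read-only calls such as password lookups no longer serialize behind key and sequence-number updates. A stand-alone sketch of the guard pattern being applied; the class below is generic and illustrative only:

    import java.util.concurrent.locks.ReentrantReadWriteLock;

    public class SequenceCounter {
      // Fair lock, matching the ReentrantReadWriteLock(true) used in the patch.
      private final ReentrantReadWriteLock apiLock = new ReentrantReadWriteLock(true);
      private int sequenceNumber;

      public int get() {              // readers may proceed in parallel
        apiLock.readLock().lock();
        try {
          return sequenceNumber;
        } finally {
          apiLock.readLock().unlock();
        }
      }

      public int increment() {        // writers are exclusive
        apiLock.writeLock().lock();
        try {
          return ++sequenceNumber;
        } finally {
          apiLock.writeLock().unlock();
        }
      }
    }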
*/ - public synchronized void addPersistedDelegationToken( + public void addPersistedDelegationToken( TokenIdent identifier, long renewDate) throws IOException { if (running) { // a safety check throw new IOException( "Can't add persisted delegation token to a running SecretManager."); } - int keyId = identifier.getMasterKeyId(); - DelegationKey dKey = allKeys.get(keyId); - byte[] password = null; - if (dKey == null) { - LOG.warn("No KEY found for persisted identifier, expiring stored token " - + formatTokenId(identifier)); - // make sure the token is expired - renewDate = 0L; - } else { - password = createPassword(identifier.getBytes(), dKey.getKey()); - } - if (identifier.getSequenceNumber() > getDelegationTokenSeqNum()) { - setDelegationTokenSeqNum(identifier.getSequenceNumber()); - } - if (getTokenInfo(identifier) == null) { - currentTokens.put(identifier, new DelegationTokenInformation(renewDate, - password, getTrackingIdIfEnabled(identifier))); - addTokenForOwnerStats(identifier); - } else { - throw new IOException("Same delegation token being added twice: " - + formatTokenId(identifier)); + this.apiLock.writeLock().lock(); + try { + int keyId = identifier.getMasterKeyId(); + DelegationKey dKey = allKeys.get(keyId); + byte[] password = null; + if (dKey == null) { + LOG.warn("No KEY found for persisted identifier, expiring stored token " + formatTokenId( + identifier)); + // make sure the token is expired + renewDate = 0L; + } else { + password = createPassword(identifier.getBytes(), dKey.getKey()); + } + if (identifier.getSequenceNumber() > getDelegationTokenSeqNum()) { + setDelegationTokenSeqNum(identifier.getSequenceNumber()); + } + if (getTokenInfo(identifier) == null) { + currentTokens.put(identifier, new DelegationTokenInformation(renewDate, password, + getTrackingIdIfEnabled(identifier))); + addTokenForOwnerStats(identifier); + } else { + throw new IOException("Same delegation token being added twice: " + + formatTokenId(identifier)); + } + } finally { + this.apiLock.writeLock().unlock(); } } @@ -441,17 +497,18 @@ private void updateCurrentKey() throws IOException { LOG.info("Updating the current master key for generating delegation tokens"); /* Create a new currentKey with an estimated expiry date. */ int newCurrentId; - synchronized (this) { - newCurrentId = incrementCurrentKeyId(); - } + newCurrentId = incrementCurrentKeyId(); DelegationKey newKey = new DelegationKey(newCurrentId, System .currentTimeMillis() + keyUpdateInterval + tokenMaxLifetime, generateSecret()); //Log must be invoked outside the lock on 'this' logUpdateMasterKey(newKey); - synchronized (this) { + this.apiLock.writeLock().lock(); + try { currentKey = newKey; storeDelegationKey(currentKey); + } finally { + this.apiLock.writeLock().unlock(); } } @@ -461,7 +518,8 @@ private void updateCurrentKey() throws IOException { * @throws IOException raised on errors performing I/O. */ protected void rollMasterKey() throws IOException { - synchronized (this) { + this.apiLock.writeLock().lock(); + try { removeExpiredKeys(); /* set final expiry date for retiring currentKey */ currentKey.setExpiryDate(Time.now() + tokenMaxLifetime); @@ -471,47 +529,59 @@ protected void rollMasterKey() throws IOException { * allKeys just in case. 
*/ updateDelegationKey(currentKey); + } finally { + this.apiLock.writeLock().unlock(); } updateCurrentKey(); } - private synchronized void removeExpiredKeys() { - long now = Time.now(); - for (Iterator> it = allKeys.entrySet() - .iterator(); it.hasNext();) { - Map.Entry e = it.next(); - if (e.getValue().getExpiryDate() < now) { - it.remove(); - // ensure the tokens generated by this current key can be recovered - // with this current key after this current key is rolled - if(!e.getValue().equals(currentKey)) - removeStoredMasterKey(e.getValue()); + private void removeExpiredKeys() { + this.apiLock.writeLock().lock(); + try { + long now = Time.now(); + for (Iterator> it = + allKeys.entrySet().iterator(); it.hasNext();) { + Map.Entry e = it.next(); + if (e.getValue().getExpiryDate() < now) { + it.remove(); + // ensure the tokens generated by this current key can be recovered + // with this current key after this current key is rolled + if (!e.getValue().equals(currentKey)) { + removeStoredMasterKey(e.getValue()); + } + } } + } finally { + this.apiLock.writeLock().unlock(); } } @Override protected byte[] createPassword(TokenIdent identifier) { - int sequenceNum; - long now = Time.now(); - sequenceNum = incrementDelegationTokenSeqNum(); - identifier.setIssueDate(now); - identifier.setMaxDate(now + tokenMaxLifetime); - DelegationKey delegationCurrentKey = currentKey; - identifier.setMasterKeyId(delegationCurrentKey.getKeyId()); - identifier.setSequenceNumber(sequenceNum); - LOG.info("Creating password for identifier: " + formatTokenId(identifier) - + ", currentKey: " + delegationCurrentKey.getKeyId()); - byte[] password = createPassword(identifier.getBytes(), delegationCurrentKey.getKey()); - DelegationTokenInformation tokenInfo = new DelegationTokenInformation(now - + tokenRenewInterval, password, getTrackingIdIfEnabled(identifier)); + this.apiLock.writeLock().lock(); try { - METRICS.trackStoreToken(() -> storeToken(identifier, tokenInfo)); - } catch (IOException ioe) { - LOG.error("Could not store token " + formatTokenId(identifier) + "!!", - ioe); + int sequenceNum; + long now = Time.now(); + sequenceNum = incrementDelegationTokenSeqNum(); + identifier.setIssueDate(now); + identifier.setMaxDate(now + tokenMaxLifetime); + identifier.setMasterKeyId(currentKey.getKeyId()); + identifier.setSequenceNumber(sequenceNum); + LOG.info("Creating password for identifier: " + formatTokenId(identifier) + + ", currentKey: " + currentKey.getKeyId()); + byte[] password = createPassword(identifier.getBytes(), currentKey.getKey()); + DelegationTokenInformation tokenInfo = + new DelegationTokenInformation(now + tokenRenewInterval, password, + getTrackingIdIfEnabled(identifier)); + try { + METRICS.trackStoreToken(() -> storeToken(identifier, tokenInfo)); + } catch (IOException ioe) { + LOG.error("Could not store token " + formatTokenId(identifier) + "!!", ioe); + } + return password; + } finally { + this.apiLock.writeLock().unlock(); } - return password; } @@ -548,7 +618,12 @@ protected DelegationTokenInformation checkToken(TokenIdent identifier) @Override public byte[] retrievePassword(TokenIdent identifier) throws InvalidToken { - return checkToken(identifier).getPassword(); + this.apiLock.readLock().lock(); + try { + return checkToken(identifier).getPassword(); + } finally { + this.apiLock.readLock().unlock(); + } } protected String getTrackingIdIfEnabled(TokenIdent ident) { @@ -559,11 +634,16 @@ protected String getTrackingIdIfEnabled(TokenIdent ident) { } public String getTokenTrackingId(TokenIdent 
identifier) { - DelegationTokenInformation info = getTokenInfo(identifier); - if (info == null) { - return null; + this.apiLock.readLock().lock(); + try { + DelegationTokenInformation info = getTokenInfo(identifier); + if (info == null) { + return null; + } + return info.getTrackingId(); + } finally { + this.apiLock.readLock().unlock(); } - return info.getTrackingId(); } /** @@ -574,10 +654,15 @@ public String getTokenTrackingId(TokenIdent identifier) { */ public void verifyToken(TokenIdent identifier, byte[] password) throws InvalidToken { - byte[] storedPassword = retrievePassword(identifier); - if (!MessageDigest.isEqual(password, storedPassword)) { - throw new InvalidToken("token " + formatTokenId(identifier) - + " is invalid, password doesn't match"); + this.apiLock.readLock().lock(); + try { + byte[] storedPassword = retrievePassword(identifier); + if (!MessageDigest.isEqual(password, storedPassword)) { + throw new InvalidToken("token " + formatTokenId(identifier) + + " is invalid, password doesn't match"); + } + } finally { + this.apiLock.readLock().unlock(); } } @@ -591,55 +676,53 @@ public void verifyToken(TokenIdent identifier, byte[] password) */ public long renewToken(Token token, String renewer) throws InvalidToken, IOException { - ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier()); - DataInputStream in = new DataInputStream(buf); - TokenIdent id = createIdentifier(); - id.readFields(in); - LOG.info("Token renewal for identifier: " + formatTokenId(id) - + "; total currentTokens " + currentTokens.size()); + this.apiLock.writeLock().lock(); + try { + ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier()); + DataInputStream in = new DataInputStream(buf); + TokenIdent id = createIdentifier(); + id.readFields(in); + LOG.info("Token renewal for identifier: " + formatTokenId(id) + "; total currentTokens " + + currentTokens.size()); + + long now = Time.now(); + if (id.getMaxDate() < now) { + throw new InvalidToken(renewer + " tried to renew an expired token " + formatTokenId(id) + + " max expiration date: " + Time.formatTime(id.getMaxDate()) + " currentTime: " + + Time.formatTime(now)); + } + if ((id.getRenewer() == null) || (id.getRenewer().toString().isEmpty())) { + throw new AccessControlException(renewer + " tried to renew a token " + formatTokenId(id) + + " without a renewer"); + } + if (!id.getRenewer().toString().equals(renewer)) { + throw new AccessControlException(renewer + " tries to renew a token " + formatTokenId(id) + + " with non-matching renewer " + id.getRenewer()); + } + DelegationKey key = getDelegationKey(id.getMasterKeyId()); + if (key == null) { + throw new InvalidToken("Unable to find master key for keyId=" + id.getMasterKeyId() + + " from cache. 
Failed to renew an unexpired token " + formatTokenId(id) + + " with sequenceNumber=" + id.getSequenceNumber()); + } + byte[] password = createPassword(token.getIdentifier(), key.getKey()); + if (!MessageDigest.isEqual(password, token.getPassword())) { + throw new AccessControlException( + renewer + " is trying to renew a token " + formatTokenId(id) + " with wrong password"); + } + long renewTime = Math.min(id.getMaxDate(), now + tokenRenewInterval); + String trackingId = getTrackingIdIfEnabled(id); + DelegationTokenInformation info = + new DelegationTokenInformation(renewTime, password, trackingId); - long now = Time.now(); - if (id.getMaxDate() < now) { - throw new InvalidToken(renewer + " tried to renew an expired token " - + formatTokenId(id) + " max expiration date: " - + Time.formatTime(id.getMaxDate()) - + " currentTime: " + Time.formatTime(now)); - } - if ((id.getRenewer() == null) || (id.getRenewer().toString().isEmpty())) { - throw new AccessControlException(renewer + - " tried to renew a token " + formatTokenId(id) - + " without a renewer"); - } - if (!id.getRenewer().toString().equals(renewer)) { - throw new AccessControlException(renewer - + " tries to renew a token " + formatTokenId(id) - + " with non-matching renewer " + id.getRenewer()); - } - DelegationKey key = getDelegationKey(id.getMasterKeyId()); - if (key == null) { - throw new InvalidToken("Unable to find master key for keyId=" - + id.getMasterKeyId() - + " from cache. Failed to renew an unexpired token " - + formatTokenId(id) + " with sequenceNumber=" - + id.getSequenceNumber()); - } - byte[] password = createPassword(token.getIdentifier(), key.getKey()); - if (!MessageDigest.isEqual(password, token.getPassword())) { - throw new AccessControlException(renewer - + " is trying to renew a token " - + formatTokenId(id) + " with wrong password"); - } - long renewTime = Math.min(id.getMaxDate(), now + tokenRenewInterval); - String trackingId = getTrackingIdIfEnabled(id); - DelegationTokenInformation info = new DelegationTokenInformation(renewTime, - password, trackingId); - - if (getTokenInfo(id) == null) { - throw new InvalidToken("Renewal request for unknown token " - + formatTokenId(id)); + if (getTokenInfo(id) == null) { + throw new InvalidToken("Renewal request for unknown token " + formatTokenId(id)); + } + METRICS.trackUpdateToken(() -> updateToken(id, info)); + return renewTime; + } finally { + this.apiLock.writeLock().unlock(); } - METRICS.trackUpdateToken(() -> updateToken(id, info)); - return renewTime; } /** @@ -653,35 +736,39 @@ public long renewToken(Token token, */ public TokenIdent cancelToken(Token token, String canceller) throws IOException { - ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier()); - DataInputStream in = new DataInputStream(buf); - TokenIdent id = createIdentifier(); - id.readFields(in); - LOG.info("Token cancellation requested for identifier: " - + formatTokenId(id)); - - if (id.getUser() == null) { - throw new InvalidToken("Token with no owner " + formatTokenId(id)); - } - String owner = id.getUser().getUserName(); - Text renewer = id.getRenewer(); - HadoopKerberosName cancelerKrbName = new HadoopKerberosName(canceller); - String cancelerShortName = cancelerKrbName.getShortName(); - if (!canceller.equals(owner) - && (renewer == null || renewer.toString().isEmpty() || !cancelerShortName - .equals(renewer.toString()))) { - throw new AccessControlException(canceller - + " is not authorized to cancel the token " + formatTokenId(id)); - } - DelegationTokenInformation 
info = currentTokens.remove(id); - if (info == null) { - throw new InvalidToken("Token not found " + formatTokenId(id)); + this.apiLock.writeLock().lock(); + try { + ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier()); + DataInputStream in = new DataInputStream(buf); + TokenIdent id = createIdentifier(); + id.readFields(in); + LOG.info("Token cancellation requested for identifier: " + formatTokenId(id)); + + if (id.getUser() == null) { + throw new InvalidToken("Token with no owner " + formatTokenId(id)); + } + String owner = id.getUser().getUserName(); + Text renewer = id.getRenewer(); + HadoopKerberosName cancelerKrbName = new HadoopKerberosName(canceller); + String cancelerShortName = cancelerKrbName.getShortName(); + if (!canceller.equals(owner) && + (renewer == null || renewer.toString().isEmpty() || + !cancelerShortName.equals(renewer.toString()))) { + throw new AccessControlException(canceller + " is not authorized to cancel the token " + + formatTokenId(id)); + } + DelegationTokenInformation info = currentTokens.remove(id); + if (info == null) { + throw new InvalidToken("Token not found " + formatTokenId(id)); + } + METRICS.trackRemoveToken(() -> { + removeTokenForOwnerStats(id); + removeStoredToken(id); + }); + return id; + } finally { + this.apiLock.writeLock().unlock(); } - METRICS.trackRemoveToken(() -> { - removeTokenForOwnerStats(id); - removeStoredToken(id); - }); - return id; } /** @@ -762,7 +849,8 @@ public void readFields(DataInput in) throws IOException { private void removeExpiredToken() throws IOException { long now = Time.now(); Set expiredTokens = new HashSet<>(); - synchronized (this) { + this.apiLock.writeLock().lock(); + try { Iterator> i = getCandidateTokensForCleanup().entrySet().iterator(); while (i.hasNext()) { @@ -774,6 +862,8 @@ private void removeExpiredToken() throws IOException { i.remove(); } } + } finally { + this.apiLock.writeLock().unlock(); } // don't hold lock on 'this' to avoid edit log updates blocking token ops logExpireTokens(expiredTokens); @@ -818,10 +908,10 @@ public void stopThreads() { * is secretMgr running * @return true if secret mgr is running */ - public synchronized boolean isRunning() { + public boolean isRunning() { return running; } - + private class ExpiredTokenRemover extends Thread { private long lastMasterKeyUpdate; private long lastTokenCacheCleanup; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/ZKDelegationTokenSecretManager.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/ZKDelegationTokenSecretManager.java index 0642d3d581066..dd2ab3ff26f76 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/ZKDelegationTokenSecretManager.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/ZKDelegationTokenSecretManager.java @@ -25,7 +25,6 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Stream; import org.apache.curator.RetryPolicy; @@ -153,7 +152,7 @@ protected static CuratorFramework getCurator() { private final int seqNumBatchSize; private int currentSeqNum; private int currentMaxSeqNum; - private final ReentrantLock currentSeqNumLock; + private final boolean isTokenWatcherEnabled; public ZKDelegationTokenSecretManager(Configuration conf) { @@ -169,8 +168,7 
@@ public ZKDelegationTokenSecretManager(Configuration conf) { ZK_DTSM_TOKEN_SEQNUM_BATCH_SIZE_DEFAULT); isTokenWatcherEnabled = conf.getBoolean(ZK_DTSM_TOKEN_WATCHER_ENABLED, ZK_DTSM_TOKEN_WATCHER_ENABLED_DEFAULT); - this.currentSeqNumLock = new ReentrantLock(true); - + String workPath = conf.get(ZK_DTSM_ZNODE_WORKING_PATH, ZK_DTSM_ZNODE_WORKING_PATH_DEAFULT); String nameSpace = workPath + "/" + ZK_DTSM_NAMESPACE; if (CURATOR_TL.get() != null) { @@ -506,28 +504,24 @@ protected int incrementDelegationTokenSeqNum() { // The secret manager will keep a local range of seq num which won't be // seen by peers, so only when the range is exhausted it will ask zk for // another range again - try { - this.currentSeqNumLock.lock(); - if (currentSeqNum >= currentMaxSeqNum) { - try { - // after a successful batch request, we can get the range starting point - currentSeqNum = incrSharedCount(delTokSeqCounter, seqNumBatchSize); - currentMaxSeqNum = currentSeqNum + seqNumBatchSize; - LOG.info("Fetched new range of seq num, from {} to {} ", - currentSeqNum+1, currentMaxSeqNum); - } catch (InterruptedException e) { - // The ExpirationThread is just finishing.. so dont do anything.. - LOG.debug( - "Thread interrupted while performing token counter increment", e); - Thread.currentThread().interrupt(); - } catch (Exception e) { - throw new RuntimeException("Could not increment shared counter !!", e); - } + if (currentSeqNum >= currentMaxSeqNum) { + try { + // after a successful batch request, we can get the range starting point + currentSeqNum = incrSharedCount(delTokSeqCounter, seqNumBatchSize); + currentMaxSeqNum = currentSeqNum + seqNumBatchSize; + LOG.info("Fetched new range of seq num, from {} to {} ", + currentSeqNum+1, currentMaxSeqNum); + } catch (InterruptedException e) { + // The ExpirationThread is just finishing.. so dont do anything.. + LOG.debug( + "Thread interrupted while performing token counter increment", e); + Thread.currentThread().interrupt(); + } catch (Exception e) { + throw new RuntimeException("Could not increment shared counter !!", e); } - return ++currentSeqNum; - } finally { - this.currentSeqNumLock.unlock(); } + + return ++currentSeqNum; } @Override diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java index 2585729950b55..b109d8bacb0cd 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java @@ -1148,6 +1148,19 @@ public static String getStackTrace(Thread t) { return str.toString(); } + /** + * Get stack trace from throwable exception. + * @param t Throwable. + * @return stack trace string. + */ + public static String getStackTrace(Throwable t) { + StringBuilder str = new StringBuilder(); + for (StackTraceElement e : t.getStackTrace()) { + str.append(e.toString() + "\n\t"); + } + return str.toString(); + } + /** * From a list of command-line arguments, remove both an option and the * next argument. 
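Annotation: the new StringUtils.getStackTrace(Throwable) overload above is what UserGroupInformation#tracePrivilegedAction uses to log the caller's stack as a plain string instead of passing an Exception to the logger. A small usage sketch; the logger and class names are illustrative:

    import org.apache.hadoop.util.StringUtils;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class StackTraceLogging {
      private static final Logger LOG = LoggerFactory.getLogger(StackTraceLogging.class);

      static void traceCaller() {
        if (LOG.isTraceEnabled()) {
          // Frames are joined with "\n\t", as asserted by the new TestStringUtils case.
          LOG.trace("called from: {}", StringUtils.getStackTrace(new Throwable()));
        }
      }

      public static void main(String[] args) {
        traceCaller();
      }
    }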
diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/exception.c b/hadoop-common-project/hadoop-common/src/main/native/src/exception.c index a25cc3d3b7eef..b4a9b81280392 100644 --- a/hadoop-common-project/hadoop-common/src/main/native/src/exception.c +++ b/hadoop-common-project/hadoop-common/src/main/native/src/exception.c @@ -110,9 +110,16 @@ jthrowable newIOException(JNIEnv* env, const char *fmt, ...) const char* terror(int errnum) { - -#if defined(__sun) || defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 32) // MT-Safe under Solaris or glibc >= 2.32 not supporting sys_errlist/sys_nerr +#if defined(__sun) + #define USE_STR_ERROR +#elif defined(__GLIBC_PREREQ) + #if __GLIBC_PREREQ(2, 32) + #define USE_STR_ERROR + #endif +#endif + +#if defined(USE_STR_ERROR) return strerror(errnum); #else if ((errnum < 0) || (errnum >= sys_nerr)) { @@ -121,4 +128,3 @@ const char* terror(int errnum) return sys_errlist[errnum]; #endif } - diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml index 075c7e02e8111..bd91de0f080fd 100644 --- a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml +++ b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml @@ -3623,6 +3623,21 @@ The switch to turn S3A auditing on or off. The JCE provider name used in CryptoCodec. + If this value is set, the corresponding provider must be added to the provider list. + The provider may be added statically in the java.security file, or + dynamically by calling the java.security.Security.addProvider(..) method, or + automatically (only for org.bouncycastle.jce.provider.BouncyCastleProvider) + by setting "hadoop.security.crypto.jce.provider.auto-add" to true + + + + + hadoop.security.crypto.jce.provider.auto-add + true + + Automatically add the org.bouncycastle.jce.provider.BouncyCastleProvider + when the value in "hadoop.security.crypto.jce.provider" is set + to BouncyCastleProvider.PROVIDER_NAME. diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md index aaead837102e7..a89d254d937c0 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md @@ -532,6 +532,7 @@ Each metrics record contains tags such as SessionId and Hostname as additional i | `NumProcessedCommands` | Num of processed commands of all BPServiceActors | | `ProcessedCommandsOpNumOps` | Total number of processed commands operations | | `ProcessedCommandsOpAvgTime` | Average time of processed commands operations in milliseconds | +| `NullStorageBlockReports` | Number of blocks in IBRs that failed due to null storage | FsVolume -------- diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoUtils.java new file mode 100644 index 0000000000000..be3695472409c --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoUtils.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.crypto; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.test.GenericTestUtils; +import org.assertj.core.api.Assertions; +import org.bouncycastle.jce.provider.BouncyCastleProvider; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.event.Level; + +import java.security.Provider; +import java.security.Security; + +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_AUTO_ADD_DEFAULT; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_AUTO_ADD_KEY; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY; + +/** Test {@link CryptoUtils}. */ +public class TestCryptoUtils { + static { + GenericTestUtils.setLogLevel(CryptoUtils.LOG, Level.TRACE); + } + + @Test(timeout = 1_000) + public void testProviderName() { + Assert.assertEquals(CryptoUtils.BOUNCY_CASTLE_PROVIDER_NAME, BouncyCastleProvider.PROVIDER_NAME); + } + + static void assertRemoveProvider() { + Security.removeProvider(BouncyCastleProvider.PROVIDER_NAME); + Assert.assertNull(Security.getProvider(BouncyCastleProvider.PROVIDER_NAME)); + } + + static void assertSetProvider(Configuration conf) { + conf.set(HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY, CryptoUtils.BOUNCY_CASTLE_PROVIDER_NAME); + final String providerFromConf = CryptoUtils.getJceProvider(conf); + Assert.assertEquals(CryptoUtils.BOUNCY_CASTLE_PROVIDER_NAME, providerFromConf); + } + + @Test(timeout = 5_000) + public void testAutoAddDisabled() { + assertRemoveProvider(); + + final Configuration conf = new Configuration(); + conf.setBoolean(HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_AUTO_ADD_KEY, false); + + assertSetProvider(conf); + + Assert.assertNull(Security.getProvider(BouncyCastleProvider.PROVIDER_NAME)); + } + + @Test(timeout = 5_000) + public void testAutoAddEnabled() { + assertRemoveProvider(); + + final Configuration conf = new Configuration(); + Assertions.assertThat(conf.get(HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_AUTO_ADD_KEY)) + .describedAs("conf: " + HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_AUTO_ADD_KEY) + .isEqualToIgnoringCase("true"); + Assert.assertTrue(HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_AUTO_ADD_DEFAULT); + + conf.set(HADOOP_SECURITY_CRYPTO_JCE_PROVIDER_KEY, CryptoUtils.BOUNCY_CASTLE_PROVIDER_NAME); + final String providerFromConf = CryptoUtils.getJceProvider(conf); + Assert.assertEquals(CryptoUtils.BOUNCY_CASTLE_PROVIDER_NAME, providerFromConf); + + final Provider provider = Security.getProvider(BouncyCastleProvider.PROVIDER_NAME); + Assertions.assertThat(provider) + .isInstanceOf(BouncyCastleProvider.class); + + assertRemoveProvider(); + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/compress/TestCodecPool.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/compress/TestCodecPool.java index 
4b18ee6047ba4..ac6aff7427e4a 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/compress/TestCodecPool.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/compress/TestCodecPool.java @@ -22,6 +22,8 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.OutputStream; +import java.lang.reflect.Field; +import java.util.List; import java.util.Random; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; @@ -32,7 +34,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.compress.zlib.BuiltInGzipCompressor; import org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor; +import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionLevel; +import org.apache.hadoop.io.compress.zlib.ZlibFactory; import org.apache.hadoop.test.LambdaTestUtils; +import org.apache.hadoop.util.ReflectionUtils; import org.junit.Before; import org.junit.Test; @@ -86,6 +91,36 @@ public void testCompressorNotReturnSameInstance() { } } + @Test(timeout = 10000) + public void testCompressorConf() throws Exception { + DefaultCodec codec1 = new DefaultCodec(); + Configuration conf = new Configuration(); + ZlibFactory.setCompressionLevel(conf, CompressionLevel.TWO); + codec1.setConf(conf); + Compressor comp1 = CodecPool.getCompressor(codec1); + CodecPool.returnCompressor(comp1); + + DefaultCodec codec2 = new DefaultCodec(); + Configuration conf2 = new Configuration(); + CompressionLevel newCompressionLevel = CompressionLevel.THREE; + ZlibFactory.setCompressionLevel(conf2, newCompressionLevel); + codec2.setConf(conf2); + Compressor comp2 = CodecPool.getCompressor(codec2); + List fields = ReflectionUtils.getDeclaredFieldsIncludingInherited(comp2.getClass()); + for (Field field : fields) { + if (field.getName().equals("level")) { + field.setAccessible(true); + Object levelValue = field.get(comp2); + if (levelValue instanceof CompressionLevel) { + assertEquals(newCompressionLevel, levelValue); + } else { + assertEquals(3, levelValue); + } + } + } + CodecPool.returnCompressor(comp2); + } + @Test(timeout = 10000) public void testDecompressorPoolCounts() { // Get two decompressors and return them diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestSaslRPC.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestSaslRPC.java index 662faea599648..9107809dad1d5 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestSaslRPC.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestSaslRPC.java @@ -536,7 +536,7 @@ public void handle(Callback[] callbacks) private static Pattern BadToken = Pattern.compile("^" + RemoteException.class.getName() + "\\("+ SaslException.class.getName() + "\\): " + - "DIGEST-MD5: digest response format violation.*"); + SaslConstants.SASL_MECHANISM + ": digest response format violation.*"); private static Pattern KrbFailed = Pattern.compile(".*Failed on local exception:.* " + "Failed to specify server's Kerberos principal name.*"); diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestStringUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestStringUtils.java index c9b42b07f4c95..e6dcc5e8de0f0 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestStringUtils.java +++ 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestStringUtils.java @@ -624,6 +624,15 @@ public void testStringCollectionSplitByEqualsFailure() throws Exception { () -> StringUtils.getTrimmedStringCollectionSplitByEquals(",=")); } + @Test + public void testForGetStackTrace() { + Throwable throwable = new Throwable(); + int stackLength = throwable.getStackTrace().length; + String stackTrace = StringUtils.getStackTrace(new Throwable()); + String[] splitTrace = stackTrace.split("\n\t"); + assertEquals(stackLength, splitTrace.length); + } + // Benchmark for StringUtils split public static void main(String []args) { final String TO_SPLIT = "foo,bar,baz,blah,blah"; diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/metrics/BlockReaderIoProvider.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/metrics/BlockReaderIoProvider.java index 0792db80b6238..7e535a071dfa3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/metrics/BlockReaderIoProvider.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/metrics/BlockReaderIoProvider.java @@ -65,7 +65,7 @@ public BlockReaderIoProvider(@Nullable ShortCircuitConf conf, public int read(FileChannel dataIn, ByteBuffer dst, long position) throws IOException{ final int nRead; - if (isEnabled && (ThreadLocalRandom.current().nextInt() < sampleRangeMax)) { + if (isEnabled && (ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE) < sampleRangeMax)) { long begin = timer.monotonicNow(); nRead = dataIn.read(dst, position); long latency = timer.monotonicNow() - begin; diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/sasl/SaslParticipant.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/sasl/SaslParticipant.java index f51f458fb2bdf..e32f76a8ebd7d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/sasl/SaslParticipant.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/sasl/SaslParticipant.java @@ -32,6 +32,7 @@ import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair; import org.apache.hadoop.security.FastSaslClientFactory; import org.apache.hadoop.security.FastSaslServerFactory; +import org.apache.hadoop.security.SaslConstants; import org.apache.hadoop.security.SaslInputStream; import org.apache.hadoop.security.SaslOutputStream; @@ -50,7 +51,7 @@ class SaslParticipant { // a short string. private static final String SERVER_NAME = "0"; private static final String PROTOCOL = "hdfs"; - private static final String MECHANISM = "DIGEST-MD5"; + private static final String[] MECHANISM_ARRAY = {SaslConstants.SASL_MECHANISM}; // One of these will always be null. 
private final SaslServer saslServer; @@ -81,7 +82,7 @@ public static SaslParticipant createServerSaslParticipant( Map saslProps, CallbackHandler callbackHandler) throws SaslException { initializeSaslServerFactory(); - return new SaslParticipant(saslServerFactory.createSaslServer(MECHANISM, + return new SaslParticipant(saslServerFactory.createSaslServer(MECHANISM_ARRAY[0], PROTOCOL, SERVER_NAME, saslProps, callbackHandler)); } @@ -99,7 +100,7 @@ public static SaslParticipant createClientSaslParticipant(String userName, throws SaslException { initializeSaslClientFactory(); return new SaslParticipant( - saslClientFactory.createSaslClient(new String[] {MECHANISM}, userName, + saslClientFactory.createSaslClient(MECHANISM_ARRAY, userName, PROTOCOL, SERVER_NAME, saslProps, callbackHandler)); } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/PoolAlignmentContext.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/PoolAlignmentContext.java index 1f2b12d445f32..2f7195f36b017 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/PoolAlignmentContext.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/PoolAlignmentContext.java @@ -64,7 +64,12 @@ public void updateResponseState(RpcHeaderProtos.RpcResponseHeaderProto.Builder h */ @Override public void receiveResponseState(RpcHeaderProtos.RpcResponseHeaderProto header) { - sharedGlobalStateId.accumulate(header.getStateId()); + if (header.getStateId() == 0 && sharedGlobalStateId.get() > 0) { + sharedGlobalStateId.reset(); + poolLocalStateId.reset(); + } else { + sharedGlobalStateId.accumulate(header.getStateId()); + } } /** diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterClientProtocol.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterClientProtocol.java index 7350583264b2c..d50648219050f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterClientProtocol.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterClientProtocol.java @@ -106,6 +106,7 @@ import java.net.ConnectException; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Comparator; import java.util.EnumSet; import java.util.HashMap; @@ -667,39 +668,28 @@ public void rename2(final String src, final String dst, public void concat(String trg, String[] src) throws IOException { rpcServer.checkOperation(NameNode.OperationCategory.WRITE); - // See if the src and target files are all in the same namespace - LocatedBlocks targetBlocks = getBlockLocations(trg, 0, 1); - if (targetBlocks == null) { - throw new IOException("Cannot locate blocks for target file - " + trg); - } - LocatedBlock lastLocatedBlock = targetBlocks.getLastLocatedBlock(); - String targetBlockPoolId = lastLocatedBlock.getBlock().getBlockPoolId(); - for (String source : src) { - LocatedBlocks sourceBlocks = getBlockLocations(source, 0, 1); - if (sourceBlocks == null) { - throw new IOException( - "Cannot located blocks for source file " + source); - } - String sourceBlockPoolId = - sourceBlocks.getLastLocatedBlock().getBlock().getBlockPoolId(); - if (!sourceBlockPoolId.equals(targetBlockPoolId)) { - throw 
new IOException("Cannot concatenate source file " + source - + " because it is located in a different namespace" - + " with block pool id " + sourceBlockPoolId - + " from the target file with block pool id " - + targetBlockPoolId); - } + // Concat only effects when all files in the same namespace. + RemoteLocation targetDestination = getFileRemoteLocation(trg); + if (targetDestination == null) { + throw new IOException("Cannot find target file - " + trg); } + String targetNameService = targetDestination.getNameserviceId(); - // Find locations in the matching namespace. - final RemoteLocation targetDestination = - rpcServer.getLocationForPath(trg, true, targetBlockPoolId); String[] sourceDestinations = new String[src.length]; for (int i = 0; i < src.length; i++) { String sourceFile = src[i]; - RemoteLocation location = - rpcServer.getLocationForPath(sourceFile, true, targetBlockPoolId); - sourceDestinations[i] = location.getDest(); + RemoteLocation srcLocation = getFileRemoteLocation(sourceFile); + if (srcLocation == null) { + throw new IOException("Cannot find source file - " + sourceFile); + } + sourceDestinations[i] = srcLocation.getDest(); + + if (!targetNameService.equals(srcLocation.getNameserviceId())) { + throw new IOException("Cannot concatenate source file " + sourceFile + + " because it is located in a different namespace" + " with nameservice " + + srcLocation.getNameserviceId() + " from the target file with nameservice " + + targetNameService); + } } // Invoke RemoteMethod method = new RemoteMethod("concat", @@ -1009,6 +999,28 @@ public HdfsFileStatus getFileInfo(String src) throws IOException { return ret; } + public RemoteLocation getFileRemoteLocation(String path) throws IOException { + rpcServer.checkOperation(NameNode.OperationCategory.READ); + + final List locations = rpcServer.getLocationsForPath(path, false, false); + if (locations.size() == 1) { + return locations.get(0); + } + RemoteLocation remoteLocation = null; + for (RemoteLocation location : locations) { + RemoteMethod method = + new RemoteMethod("getFileInfo", new Class[] {String.class}, new RemoteParam()); + HdfsFileStatus ret = rpcClient.invokeSequential(Collections.singletonList(location), method, + HdfsFileStatus.class, null); + if (ret != null) { + remoteLocation = location; + break; + } + } + + return remoteLocation; + } + @Override public boolean isFileClosed(String src) throws IOException { rpcServer.checkOperation(NameNode.OperationCategory.READ); diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcServer.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcServer.java index fe1323c4b5fe1..217c62ff28762 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcServer.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcServer.java @@ -1697,42 +1697,6 @@ public Long getNextSPSPath() throws IOException { return nnProto.getNextSPSPath(); } - /** - * Locate the location with the matching block pool id. - * - * @param path Path to check. - * @param failIfLocked Fail the request if locked (top mount point). - * @param blockPoolId The block pool ID of the namespace to search for. - * @return Prioritized list of locations in the federated cluster. - * @throws IOException if the location for this path cannot be determined. 
- */ - protected RemoteLocation getLocationForPath( - String path, boolean failIfLocked, String blockPoolId) - throws IOException { - - final List locations = - getLocationsForPath(path, failIfLocked); - - String nameserviceId = null; - Set namespaces = - this.namenodeResolver.getNamespaces(); - for (FederationNamespaceInfo namespace : namespaces) { - if (namespace.getBlockPoolId().equals(blockPoolId)) { - nameserviceId = namespace.getNameserviceId(); - break; - } - } - if (nameserviceId != null) { - for (RemoteLocation location : locations) { - if (location.getNameserviceId().equals(nameserviceId)) { - return location; - } - } - } - throw new IOException( - "Cannot locate a nameservice for block pool " + blockPoolId); - } - /** * Get the possible locations of a path in the federated cluster. * During the get operation, it will do the quota verification. diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/CachedRecordStore.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/CachedRecordStore.java index 08dcc1c6e469f..59da6145352a2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/CachedRecordStore.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/store/CachedRecordStore.java @@ -189,7 +189,7 @@ public void overrideExpiredRecords(QueryResult query) throws IOException { LOG.warn("Couldn't delete State Store record {}: {}", recordName, record); } - } else if (record.checkExpired(currentDriverTime)) { + } else if (!record.isExpired() && record.checkExpired(currentDriverTime)) { String recordName = StateStoreUtils.getRecordName(record.getClass()); LOG.info("Override State Store record {}: {}", recordName, record); commitRecords.add(record); diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestObserverWithRouter.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestObserverWithRouter.java index 2f8beb20f7899..eaee5b8b14613 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestObserverWithRouter.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestObserverWithRouter.java @@ -107,10 +107,7 @@ public void teardown() throws IOException { public void startUpCluster(int numberOfObserver, Configuration confOverrides) throws Exception { int numberOfNamenode = 2 + numberOfObserver; Configuration conf = new Configuration(false); - conf.setBoolean(RBFConfigKeys.DFS_ROUTER_OBSERVER_READ_DEFAULT_KEY, true); - conf.setBoolean(DFSConfigKeys.DFS_HA_TAILEDITS_INPROGRESS_KEY, true); - conf.set(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, "0ms"); - conf.setBoolean(DFS_NAMENODE_STATE_CONTEXT_ENABLED_KEY, true); + setConfDefaults(conf); if (confOverrides != null) { confOverrides .iterator() @@ -153,6 +150,13 @@ public void startUpCluster(int numberOfObserver, Configuration confOverrides) th routerContext = cluster.getRandomRouter(); } + private void setConfDefaults(Configuration conf) { + conf.setBoolean(RBFConfigKeys.DFS_ROUTER_OBSERVER_READ_DEFAULT_KEY, true); + conf.setBoolean(DFSConfigKeys.DFS_HA_TAILEDITS_INPROGRESS_KEY, true); + conf.set(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, "0ms"); + conf.setBoolean(DFS_NAMENODE_STATE_CONTEXT_ENABLED_KEY, true); + } + 
public enum ConfigSetting { USE_NAMENODE_PROXY_FLAG, USE_ROUTER_OBSERVER_READ_PROXY_PROVIDER, @@ -972,4 +976,55 @@ public void testMsyncWithNoNamespacesEligibleForCRS(ConfigSetting configSetting) // There should no calls to any namespace. assertEquals("No calls to any namespace", 0, rpcCountForActive); } + + @EnumSource(ConfigSetting.class) + @ParameterizedTest + public void testRestartingNamenodeWithStateIDContextDisabled(ConfigSetting configSetting) + throws Exception { + fileSystem = routerContext.getFileSystem(getConfToEnableObserverReads(configSetting)); + Path path = new Path("/testFile1"); + // Send Create call to active + fileSystem.create(path).close(); + + // Send read request + fileSystem.open(path).close(); + + long observerCount1 = routerContext.getRouter().getRpcServer() + .getRPCMetrics().getObserverProxyOps(); + + // Restart active namenodes and disable sending state id. + restartActiveWithStateIDContextDisabled(); + + Configuration conf = getConfToEnableObserverReads(configSetting); + conf.setBoolean("fs.hdfs.impl.disable.cache", true); + FileSystem fileSystem2 = routerContext.getFileSystem(conf); + fileSystem2.msync(); + fileSystem2.open(path).close(); + + long observerCount2 = routerContext.getRouter().getRpcServer() + .getRPCMetrics().getObserverProxyOps(); + assertEquals("There should no extra calls to the observer", observerCount1, observerCount2); + + fileSystem.open(path).close(); + long observerCount3 = routerContext.getRouter().getRpcServer() + .getRPCMetrics().getObserverProxyOps(); + assertTrue("Old filesystem will send calls to observer", observerCount3 > observerCount2); + } + + void restartActiveWithStateIDContextDisabled() throws Exception { + for (int nnIndex = 0; nnIndex < cluster.getNamenodes().size(); nnIndex++) { + NameNode nameNode = cluster.getCluster().getNameNode(nnIndex); + if (nameNode != null && nameNode.isActiveState()) { + Configuration conf = new Configuration(); + setConfDefaults(conf); + cluster.getCluster().getConfiguration(nnIndex) + .setBoolean(DFS_NAMENODE_STATE_CONTEXT_ENABLED_KEY, false); + cluster.getCluster().restartNameNode(nnIndex, true); + cluster.getCluster().getNameNode(nnIndex).isActiveState(); + } + } + for (String ns : cluster.getNameservices()) { + cluster.switchToActive(ns, NAMENODES[0]); + } + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestPoolAlignmentContext.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestPoolAlignmentContext.java index ef6745654cf22..f691f61728004 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestPoolAlignmentContext.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestPoolAlignmentContext.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hdfs.server.federation.router; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcResponseHeaderProto; import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcRequestHeaderProto; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -50,4 +51,35 @@ private void assertRequestHeaderStateId(PoolAlignmentContext poolAlignmentContex poolAlignmentContext.updateRequestState(builder); Assertions.assertEquals(expectedValue, builder.getStateId()); } + + @Test + public void testWhenNamenodeStopsSendingStateId() { + RouterStateIdContext 
routerStateIdContext = new RouterStateIdContext(new Configuration()); + String namespaceId = "namespace1"; + PoolAlignmentContext poolContext = new PoolAlignmentContext(routerStateIdContext, namespaceId); + + poolContext.receiveResponseState(getRpcResponseHeader(10L)); + // Last seen value is the one from namenode, + // but request header is the max seen by clients so far. + Assertions.assertEquals(10L, poolContext.getLastSeenStateId()); + assertRequestHeaderStateId(poolContext, Long.MIN_VALUE); + + poolContext.advanceClientStateId(10L); + assertRequestHeaderStateId(poolContext, 10L); + + // When namenode state context is disabled, it returns a stateId of zero + poolContext.receiveResponseState(getRpcResponseHeader(0)); + // Routers should reset the cached state Id to not send a stale value to the observer. + Assertions.assertEquals(Long.MIN_VALUE, poolContext.getLastSeenStateId()); + assertRequestHeaderStateId(poolContext, Long.MIN_VALUE); + } + + private RpcResponseHeaderProto getRpcResponseHeader(long stateID) { + return RpcResponseHeaderProto + .newBuilder() + .setCallId(1) + .setStatus(RpcResponseHeaderProto.RpcStatusProto.SUCCESS) + .setStateId(stateID) + .build(); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRpc.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRpc.java index 766a035151c2a..c84dd2ceb2060 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRpc.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRpc.java @@ -1163,6 +1163,21 @@ public void testProxyGetPreferedBlockSize() throws Exception { routerProtocol, nnProtocol, m, new Object[] {badPath}); } + private void testConcat( + String source, String target, boolean failureExpected, boolean verifyException, String msg) { + boolean failure = false; + try { + // Concat test file with full block length file via router + routerProtocol.concat(target, new String[] {source}); + } catch (IOException ex) { + failure = true; + if (verifyException) { + assertExceptionContains(msg, ex); + } + } + assertEquals(failureExpected, failure); + } + private void testConcat( String source, String target, boolean failureExpected) { boolean failure = false; @@ -1224,6 +1239,27 @@ public void testProxyConcatFile() throws Exception { String badPath = "/unknownlocation/unknowndir"; compareResponses(routerProtocol, nnProtocol, m, new Object[] {badPath, new String[] {routerFile}}); + + // Test when concat trg is an empty file + createFile(routerFS, existingFile, existingFileSize); + String sameRouterEmptyFile = + cluster.getFederatedTestDirectoryForNS(sameNameservice) + + "_newemptyfile"; + createFile(routerFS, sameRouterEmptyFile, 0); + // Concat in same namespaces, succeeds + testConcat(existingFile, sameRouterEmptyFile, false); + FileStatus mergedStatus = getFileStatus(routerFS, sameRouterEmptyFile); + assertEquals(existingFileSize, mergedStatus.getLen()); + + // Test when concat srclist contains an empty file, the namenode will throw an IOException.
+ String srcEmptyFile = cluster.getFederatedTestDirectoryForNS(sameNameservice) + "_srcEmptyFile"; + createFile(routerFS, srcEmptyFile, 0); + String targetFile = cluster.getFederatedTestDirectoryForNS(sameNameservice) + "_targetFile"; + createFile(routerFS, targetFile, existingFileSize); + // Concat with an empty source file in the same namespace, fails + testConcat(srcEmptyFile, targetFile, true, true, + "org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.HadoopIllegalArgumentException): concat: source file " + + srcEmptyFile + " is invalid or empty or underConstruction"); } @Test diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/store/TestStateStoreMembershipState.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/store/TestStateStoreMembershipState.java index f7f0970bd364e..9e3eb97853a77 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/store/TestStateStoreMembershipState.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/store/TestStateStoreMembershipState.java @@ -29,11 +29,18 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -49,16 +56,22 @@ import org.apache.hadoop.hdfs.server.federation.store.protocol.UpdateNamenodeRegistrationRequest; import org.apache.hadoop.hdfs.server.federation.store.records.MembershipState; import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.test.GenericTestUtils.DelayAnswer; import org.apache.hadoop.util.Time; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Test the basic {@link MembershipStore} membership functionality. 
*/ public class TestStateStoreMembershipState extends TestStateStoreBase { + private static Logger LOG = LoggerFactory.getLogger( + TestStateStoreMembershipState.class); + private static MembershipStore membershipStore; @BeforeClass @@ -529,6 +542,94 @@ public void testRegistrationExpiredAndDeletion() }, 100, 3000); } + @Test + public void testRegistrationExpiredRaceCondition() + throws InterruptedException, IOException, TimeoutException, ExecutionException { + + // Populate the state store with a single NN element + // 1) ns0:nn0 - Expired + // Create a thread to refresh the cached records, pulling the expired record + // into the thread's memory + // Then insert an active record, and confirm that the refresh thread does not + // override the active record with the expired record it has in memory + + MembershipState.setDeletionMs(-1); + + MembershipState expiredReport = createRegistration( + NAMESERVICES[0], NAMENODES[0], ROUTERS[0], + FederationNamenodeServiceState.ACTIVE); + expiredReport.setDateModified(Time.monotonicNow() - 5000); + expiredReport.setState(FederationNamenodeServiceState.EXPIRED); + assertTrue(namenodeHeartbeat(expiredReport)); + + // Load cache + MembershipStore memStoreSpy = spy(membershipStore); + DelayAnswer delayer = new DelayAnswer(LOG); + doAnswer(delayer).when(memStoreSpy).overrideExpiredRecords(any()); + + ExecutorService pool = Executors.newFixedThreadPool(1); + + Future<Boolean> cacheRefreshFuture = pool.submit(() -> { + try { + return memStoreSpy.loadCache(true); + } catch (IOException e) { + LOG.error("Exception while loading cache:", e); + } + return false; + }); + + // Verify quorum and entry + MembershipState quorumEntry = getNamenodeRegistration( + expiredReport.getNameserviceId(), expiredReport.getNamenodeId()); + assertNull(quorumEntry); + + + MembershipState record = membershipStore.getDriver() + .get(MembershipState.class).getRecords().get(0); + assertNotNull(record); + assertEquals(ROUTERS[0], record.getRouterId()); + assertEquals(FederationNamenodeServiceState.EXPIRED, + record.getState()); + + // Insert active while the other thread is refreshing its cache + MembershipState activeReport = createRegistration( + NAMESERVICES[0], NAMENODES[0], ROUTERS[0], + FederationNamenodeServiceState.ACTIVE); + + delayer.waitForCall(); + assertTrue(namenodeHeartbeat(activeReport)); + + record = membershipStore.getDriver() + .get(MembershipState.class).getRecords().get(0); + assertNotNull(record); + assertEquals(ROUTERS[0], record.getRouterId()); + assertEquals(FederationNamenodeServiceState.ACTIVE, + record.getState()); + + quorumEntry = getExpiredNamenodeRegistration( + expiredReport.getNameserviceId(), expiredReport.getNamenodeId()); + assertNull(quorumEntry); + + // Allow the thread to finish refreshing the cache + delayer.proceed(); + assertTrue(cacheRefreshFuture.get(5, TimeUnit.SECONDS)); + + // The state store should still be the active report + record = membershipStore.getDriver() + .get(MembershipState.class).getRecords().get(0); + assertNotNull(record); + assertEquals(ROUTERS[0], record.getRouterId()); + assertEquals(FederationNamenodeServiceState.ACTIVE, + record.getState()); + + membershipStore.loadCache(true); + + quorumEntry = getExpiredNamenodeRegistration( + expiredReport.getNameserviceId(), + expiredReport.getNamenodeId()); + assertNull(quorumEntry); + } + @Test public void testNamespaceInfoWithUnavailableNameNodeRegistration() throws IOException { diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/sasl/SaslDataTransferServer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/sasl/SaslDataTransferServer.java index 1d8928f75c705..adf3a99634567 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/sasl/SaslDataTransferServer.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/sasl/SaslDataTransferServer.java @@ -241,7 +241,7 @@ public void handle(Callback[] callbacks) throws IOException, continue; // realm is ignored } else { throw new UnsupportedCallbackException(callback, - "Unrecognized SASL DIGEST-MD5 Callback: " + callback); + "Unrecognized SASL Callback: " + callback); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournalNodeHttpServer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournalNodeHttpServer.java index 1d29c1beb364f..4f51fe8b506e0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournalNodeHttpServer.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournalNodeHttpServer.java @@ -78,6 +78,16 @@ void start() throws IOException { DFSConfigKeys.DFS_JOURNALNODE_KERBEROS_INTERNAL_SPNEGO_PRINCIPAL_KEY, DFSConfigKeys.DFS_JOURNALNODE_KEYTAB_FILE_KEY); + final boolean xFrameEnabled = conf.getBoolean( + DFSConfigKeys.DFS_XFRAME_OPTION_ENABLED, + DFSConfigKeys.DFS_XFRAME_OPTION_ENABLED_DEFAULT); + + final String xFrameOptionValue = conf.getTrimmed( + DFSConfigKeys.DFS_XFRAME_OPTION_VALUE, + DFSConfigKeys.DFS_XFRAME_OPTION_VALUE_DEFAULT); + + builder.configureXFrame(xFrameEnabled).setXFrameOption(xFrameOptionValue); + httpServer = builder.build(); httpServer.setAttribute(JN_ATTRIBUTE_KEY, localJournalNode); httpServer.setAttribute(JspHelper.CURRENT_CONF, conf); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java index 1a2c024c904c0..11489e919c493 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hdfs.server.protocol.*; import org.apache.hadoop.hdfs.server.protocol.BlockECReconstructionCommand.BlockECReconstructionInfo; import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo.BlockStatus; +import org.apache.hadoop.thirdparty.com.google.common.base.Joiner; import org.apache.hadoop.util.Lists; import org.apache.hadoop.util.Sets; @@ -324,6 +325,12 @@ private void notifyNamenodeBlock(ExtendedBlock block, BlockStatus status, final ReceivedDeletedBlockInfo info = new ReceivedDeletedBlockInfo( block.getLocalBlock(), status, delHint); final DatanodeStorage storage = dn.getFSDataset().getStorage(storageUuid); + if (storage == null) { + LOG.warn("Trying to add RDBI for null storage UUID {}. 
Trace: {}", storageUuid, + Joiner.on("\n").join(Thread.currentThread().getStackTrace())); + getDataNode().getMetrics().incrNullStorageBlockReports(); + return; + } for (BPServiceActor actor : bpServices) { actor.getIbrManager().notifyNamenodeBlock(info, storage, diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java index 5ac6ee9ccbdaf..f7cec35b80d5e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java @@ -434,12 +434,12 @@ class BlockSender implements java.io.Closeable { blockIn = datanode.data.getBlockInputStream(block, offset); // seek to offset ris = new ReplicaInputStreams( blockIn, checksumIn, volumeRef, fileIoProvider); - } catch (IOException ioe) { + } catch (Throwable t) { IOUtils.cleanupWithLogger(null, volumeRef); IOUtils.closeStream(this); IOUtils.closeStream(blockIn); IOUtils.closeStream(checksumIn); - throw ioe; + throw t; } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java index 956f5bbe519d4..87e8eee681d1d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java @@ -4057,7 +4057,8 @@ public void checkDiskError() throws IOException { } } - private void handleVolumeFailures(Set unhealthyVolumes) { + @VisibleForTesting + public void handleVolumeFailures(Set unhealthyVolumes) { if (unhealthyVolumes.isEmpty()) { LOG.debug("handleVolumeFailures done with empty " + "unhealthyVolumes"); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java index 372271b4fb28a..9e046cc3600df 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java @@ -172,4 +172,10 @@ public void delayDiffRecord() {} * Just delay getMetaDataInputStream a while. */ public void delayGetMetaDataInputStream() {} + + /** + * Used in {@link DirectoryScanner#reconcile()} to wait until a storage is removed, + * leaving a stale copy of {@link DirectoryScanner#diffs}. 
+ */ + public void waitUntilStorageRemoved() {} } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java index 30a2d2e58431d..a99f3d78e2ba1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java @@ -466,7 +466,7 @@ void shutdown() { public void reconcile() throws IOException { LOG.debug("reconcile start DirectoryScanning"); scan(); - + DataNodeFaultInjector.get().waitUntilStorageRemoved(); // HDFS-14476: run checkAndUpdate with batch to avoid holding the lock too // long int loopCount = 0; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ProfilingFileIoEvents.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ProfilingFileIoEvents.java index c22401b645f14..dbaf047ef3e4b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ProfilingFileIoEvents.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ProfilingFileIoEvents.java @@ -80,7 +80,7 @@ public void afterMetadataOp(@Nullable FsVolumeSpi volume, public long beforeFileIo(@Nullable FsVolumeSpi volume, FileIoProvider.OPERATION op, long len) { - if (isEnabled && ThreadLocalRandom.current().nextInt() < sampleRangeMax) { + if (isEnabled && ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE) < sampleRangeMax) { DataNodeVolumeMetrics metrics = getVolumeMetrics(volume); if (metrics != null) { return Time.monotonicNow(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java index 5be095118fc38..0ca222c083c9b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java @@ -2745,8 +2745,12 @@ public void checkAndUpdate(String bpid, ScanInfo scanInfo) curDirScannerNotifyCount = 0; lastDirScannerNotifyTime = startTimeMs; } - try (AutoCloseableLock lock = lockManager.writeLock(LockLevel.VOLUME, bpid, - vol.getStorageID())) { + String storageUuid = vol.getStorageID(); + try (AutoCloseableLock lock = lockManager.writeLock(LockLevel.VOLUME, bpid, storageUuid)) { + if (!storageMap.containsKey(storageUuid)) { + // Storage was already removed + return; + } memBlockInfo = volumeMap.get(bpid, blockId); if (memBlockInfo != null && memBlockInfo.getState() != ReplicaState.FINALIZED) { @@ -2833,7 +2837,7 @@ public void checkAndUpdate(String bpid, ScanInfo scanInfo) maxDirScannerNotifyCount++; datanode.notifyNamenodeReceivedBlock( new ExtendedBlock(bpid, diskBlockInfo), null, - vol.getStorageID(), vol.isTransientStorage()); + storageUuid, vol.isTransientStorage()); } if (vol.isTransientStorage()) { long lockedBytesReserved = diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java index 47f0a3556aad0..6b026823f19f9 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java @@ -1291,7 +1291,9 @@ public ReplicaInPipeline append(String bpid, ReplicaInfo replicaInfo, // rename meta file to rbw directory // rename block file to rbw directory + long oldReplicaLength = replicaInfo.getNumBytes() + replicaInfo.getMetadataLength(); newReplicaInfo.moveReplicaFrom(replicaInfo, newBlkFile); + getBlockPoolSlice(bpid).decDfsUsed(oldReplicaLength); reserveSpaceForReplica(bytesReserved); return newReplicaInfo; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java index 2e902f694a12d..832a8029f7771 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java @@ -185,6 +185,8 @@ public class DataNodeMetrics { private MutableCounterLong numProcessedCommands; @Metric("Rate of processed commands of all BPServiceActors") private MutableRate processedCommandsOp; + @Metric("Number of blocks in IBRs that failed due to null storage") + private MutableCounterLong nullStorageBlockReports; // FsDatasetImpl local file process metrics. @Metric private MutableRate createRbwOp; @@ -812,4 +814,7 @@ public void incrReplaceBlockOpToOtherHost() { replaceBlockOpToOtherHost.incr(); } + public void incrNullStorageBlockReports() { + nullStorageBlockReports.incr(); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index e21c24398680d..6e5117f4db5da 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -1531,10 +1531,10 @@ void stopActiveServices() { if (dir != null && getFSImage() != null) { if (getFSImage().editLog != null) { getFSImage().editLog.close(); + // Update the fsimage with the last txid that we wrote + // so that the tailer starts from the right spot. + getFSImage().updateLastAppliedTxIdFromWritten(); } - // Update the fsimage with the last txid that we wrote - // so that the tailer starts from the right spot. 
- getFSImage().updateLastAppliedTxIdFromWritten(); } if (dir != null) { dir.ezManager.stopReencryptThread(); @@ -2618,9 +2618,10 @@ void unsetStoragePolicy(String src) throws IOException { * @throws IOException */ BlockStoragePolicy getStoragePolicy(String src) throws IOException { + final String operationName = "getStoragePolicy"; checkOperation(OperationCategory.READ); final FSPermissionChecker pc = getPermissionChecker(); - FSPermissionChecker.setOperationType(null); + FSPermissionChecker.setOperationType(operationName); readLock(); try { checkOperation(OperationCategory.READ); @@ -2646,9 +2647,10 @@ BlockStoragePolicy[] getStoragePolicies() throws IOException { } long getPreferredBlockSize(String src) throws IOException { + final String operationName = "getPreferredBlockSize"; checkOperation(OperationCategory.READ); final FSPermissionChecker pc = getPermissionChecker(); - FSPermissionChecker.setOperationType(null); + FSPermissionChecker.setOperationType(operationName); readLock(); try { checkOperation(OperationCategory.READ); @@ -2709,6 +2711,8 @@ HdfsFileStatus startFile(String src, PermissionStatus permissions, String storagePolicy, boolean logRetryCache) throws IOException { HdfsFileStatus status; + final String operationName = "create"; + FSPermissionChecker.setOperationType(operationName); try { status = startFileInt(src, permissions, holder, clientMachine, flag, createParent, replication, blockSize, supportedVersions, ecPolicyName, @@ -2764,7 +2768,6 @@ private HdfsFileStatus startFileInt(String src, checkOperation(OperationCategory.WRITE); final FSPermissionChecker pc = getPermissionChecker(); - FSPermissionChecker.setOperationType(null); writeLock(); try { checkOperation(OperationCategory.WRITE); @@ -2857,9 +2860,10 @@ private HdfsFileStatus startFileInt(String src, boolean recoverLease(String src, String holder, String clientMachine) throws IOException { boolean skipSync = false; + final String operationName = "recoverLease"; checkOperation(OperationCategory.WRITE); final FSPermissionChecker pc = getPermissionChecker(); - FSPermissionChecker.setOperationType(null); + FSPermissionChecker.setOperationType(operationName); writeLock(); try { checkOperation(OperationCategory.WRITE); @@ -3105,9 +3109,10 @@ LocatedBlock getAdditionalDatanode(String src, long fileId, final byte storagePolicyID; final List chosen; final BlockType blockType; + final String operationName = "getAdditionalDatanode"; checkOperation(OperationCategory.WRITE); final FSPermissionChecker pc = getPermissionChecker(); - FSPermissionChecker.setOperationType(null); + FSPermissionChecker.setOperationType(operationName); readLock(); try { // Changing this operation category to WRITE instead of making getAdditionalDatanode as a @@ -3155,10 +3160,11 @@ LocatedBlock getAdditionalDatanode(String src, long fileId, */ void abandonBlock(ExtendedBlock b, long fileId, String src, String holder) throws IOException { + final String operationName = "abandonBlock"; NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: {} of file {}", b, src); checkOperation(OperationCategory.WRITE); final FSPermissionChecker pc = getPermissionChecker(); - FSPermissionChecker.setOperationType(null); + FSPermissionChecker.setOperationType(operationName); writeLock(); try { checkOperation(OperationCategory.WRITE); @@ -3222,9 +3228,10 @@ boolean completeFile(final String src, String holder, ExtendedBlock last, long fileId) throws IOException { boolean success = false; + final String operationName = "completeFile"; 
checkOperation(OperationCategory.WRITE); final FSPermissionChecker pc = getPermissionChecker(); - FSPermissionChecker.setOperationType(null); + FSPermissionChecker.setOperationType(operationName); writeLock(); try { checkOperation(OperationCategory.WRITE); @@ -3666,10 +3673,11 @@ void setQuota(String src, long nsQuota, long ssQuota, StorageType type) */ void fsync(String src, long fileId, String clientName, long lastBlockLength) throws IOException { + final String operationName = "fsync"; NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName); checkOperation(OperationCategory.WRITE); final FSPermissionChecker pc = getPermissionChecker(); - FSPermissionChecker.setOperationType(null); + FSPermissionChecker.setOperationType(operationName); writeLock(); try { checkOperation(OperationCategory.WRITE); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java index b57c146023c69..8e57bcb3beb7a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java @@ -2095,22 +2095,20 @@ int getReconfigurationStatus(final String nodeType, final String address, final if (errMsg != null) { err.println(errMsg); return 1; - } else { - out.print(outMsg); } if (status != null) { if (!status.hasTask()) { - out.println("no task was found."); + out.println(outMsg + "no task was found."); return 0; } - out.print("started at " + new Date(status.getStartTime())); + String startMsg = outMsg + "started at " + new Date(status.getStartTime()); if (!status.stopped()) { - out.println(" and is still running."); + out.println(startMsg + " and is still running."); return 0; } - out.println(" and finished at " + out.println(startMsg + " and finished at " + new Date(status.getEndTime()).toString() + "."); if (status.getStatus() == null) { // Nothing to report. @@ -2133,6 +2131,7 @@ int getReconfigurationStatus(final String nodeType, final String address, final } } } else { + out.println(outMsg); return 1; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/server/TestJournalNodeHttpServerXFrame.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/server/TestJournalNodeHttpServerXFrame.java new file mode 100644 index 0000000000000..c870fcceef4fc --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/qjournal/server/TestJournalNodeHttpServerXFrame.java @@ -0,0 +1,86 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.hadoop.hdfs.qjournal.server; + +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Test; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.qjournal.MiniJournalCluster; +import org.apache.hadoop.http.HttpServer2; + +/** + * Test that X-Frame-Options works correctly with JournalNodeHttpServer. + */ +public class TestJournalNodeHttpServerXFrame { + + private static final int NUM_JN = 1; + + private MiniJournalCluster cluster; + + @Test + public void testJournalNodeXFrameOptionsEnabled() throws Exception { + boolean xFrameEnabled = true; + cluster = createCluster(xFrameEnabled); + HttpURLConnection conn = getConn(cluster); + String xfoHeader = conn.getHeaderField("X-FRAME-OPTIONS"); + Assert.assertTrue("X-FRAME-OPTIONS is absent in the header", xfoHeader != null); + Assert.assertTrue(xfoHeader.endsWith(HttpServer2.XFrameOption.SAMEORIGIN.toString())); + } + + @Test + public void testJournalNodeXFrameOptionsDisabled() throws Exception { + boolean xFrameEnabled = false; + cluster = createCluster(xFrameEnabled); + HttpURLConnection conn = getConn(cluster); + String xfoHeader = conn.getHeaderField("X-FRAME-OPTIONS"); + System.out.println(xfoHeader); + Assert.assertTrue("unexpected X-FRAME-OPTION in header", xfoHeader == null); + } + + @After + public void cleanup() throws IOException { + if (cluster != null) { + cluster.shutdown(); + cluster = null; + } + } + + private static MiniJournalCluster createCluster(boolean enabled) throws IOException { + Configuration conf = new Configuration(); + conf.setBoolean(DFSConfigKeys.DFS_XFRAME_OPTION_ENABLED, enabled); + MiniJournalCluster jCluster = + new MiniJournalCluster.Builder(conf).format(true).numJournalNodes(NUM_JN).build(); + jCluster.waitActive(); + return jCluster; + } + + private static HttpURLConnection getConn(MiniJournalCluster journalCluster) throws IOException { + JournalNode journalNode = journalCluster.getJournalNode(0); + URL newURL = new URL(journalNode.getHttpServerURI()); + HttpURLConnection conn = (HttpURLConnection) newURL.openConnection(); + conn.connect(); + return conn; + } +} \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java index 65855427d7253..fd1b5609b1f0a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java @@ -136,6 +136,7 @@ public class TestBPOfferService { private FsDatasetSpi mockFSDataset; private DataSetLockManager dataSetLockManager = new DataSetLockManager(); private boolean isSlownode; + private String mockStorageID; @Before public void setupMocks() throws Exception { @@ -157,6 +158,7 @@ public void setupMocks() throws Exception { // Set up a simulated dataset with our fake BP mockFSDataset = 
Mockito.spy(new SimulatedFSDataset(null, conf)); mockFSDataset.addBlockPool(FAKE_BPID, conf); + mockStorageID = ((SimulatedFSDataset) mockFSDataset).getStorages().get(0).getStorageUuid(); // Wire the dataset to the DN. Mockito.doReturn(mockFSDataset).when(mockDn).getFSDataset(); @@ -289,7 +291,7 @@ public void testBasicFunctionality() throws Exception { waitForBlockReport(mockNN2); // When we receive a block, it should report it to both NNs - bpos.notifyNamenodeReceivedBlock(FAKE_BLOCK, null, "", false); + bpos.notifyNamenodeReceivedBlock(FAKE_BLOCK, null, mockStorageID, false); ReceivedDeletedBlockInfo[] ret = waitForBlockReceived(FAKE_BLOCK, mockNN1); assertEquals(1, ret.length); @@ -1099,7 +1101,7 @@ public void testRefreshNameNodes() throws Exception { waitForBlockReport(mockNN2); // When we receive a block, it should report it to both NNs - bpos.notifyNamenodeReceivedBlock(FAKE_BLOCK, null, "", false); + bpos.notifyNamenodeReceivedBlock(FAKE_BLOCK, null, mockStorageID, false); ReceivedDeletedBlockInfo[] ret = waitForBlockReceived(FAKE_BLOCK, mockNN1); @@ -1140,7 +1142,7 @@ public void testRefreshNameNodes() throws Exception { Mockito.verify(mockNN3).registerDatanode(Mockito.any()); // When we receive a block, it should report it to both NNs - bpos.notifyNamenodeReceivedBlock(FAKE_BLOCK, null, "", false); + bpos.notifyNamenodeReceivedBlock(FAKE_BLOCK, null, mockStorageID, false); // veridfy new NN recieved block report ret = waitForBlockReceived(FAKE_BLOCK, mockNN3); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDirectoryScanner.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDirectoryScanner.java index 96b32639632f6..3392410d1fe0d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDirectoryScanner.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDirectoryScanner.java @@ -37,9 +37,11 @@ import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.Collection; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Random; +import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; @@ -1420,4 +1422,50 @@ private void writeFile(FileSystem fs, int numFiles) throws IOException { DFSTestUtil.createFile(fs, filePath, 1, (short) 1, 0); } } + + @Test(timeout = 30000) + public void testNullStorage() throws Exception { + DataNodeFaultInjector oldInjector = DataNodeFaultInjector.get(); + + Configuration conf = getConfiguration(); + conf.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1); + cluster = new MiniDFSCluster.Builder(conf).build(); + try { + cluster.waitActive(); + bpid = cluster.getNamesystem().getBlockPoolId(); + fds = DataNodeTestUtils.getFSDataset(cluster.getDataNodes().get(0)); + client = cluster.getFileSystem().getClient(); + conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_THREADS_KEY, 1); + createFile(GenericTestUtils.getMethodName(), BLOCK_LENGTH, false); + // Make sure checkAndUpdate will run + truncateBlockFile(); + + // Mock a volume corruption after DirectoryScanner.scan() but before checkAndUpdate() + FsVolumeImpl volumeToRemove = fds.getVolumeList().get(0); + DataNodeFaultInjector injector = new DataNodeFaultInjector() { + @Override + public void waitUntilStorageRemoved() { + Set volumesToRemove = new 
HashSet<>(); + volumesToRemove.add(volumeToRemove); + cluster.getDataNodes().get(0).handleVolumeFailures(volumesToRemove); + } + }; + DataNodeFaultInjector.set(injector); + + GenericTestUtils.LogCapturer logCapturer = + GenericTestUtils.LogCapturer.captureLogs(DataNode.LOG); + scanner = new DirectoryScanner(fds, conf); + scanner.setRetainDiffs(true); + scanner.reconcile(); + assertFalse(logCapturer.getOutput() + .contains("Trying to add RDBI for null storage UUID " + volumeToRemove.getStorageID())); + } finally { + if (scanner != null) { + scanner.shutdown(); + scanner = null; + } + cluster.shutdown(); + DataNodeFaultInjector.set(oldInjector); + } + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java index 5468473d9de0b..dd85ab6328c9c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java @@ -2102,4 +2102,46 @@ public void delayGetMetaDataInputStream() { DataNodeFaultInjector.set(oldDnInjector); } } + + @Test(timeout = 30000) + public void testAppend() { + MiniDFSCluster cluster = null; + try { + cluster = new MiniDFSCluster.Builder(conf) + .numDataNodes(1) + .storageTypes(new StorageType[]{StorageType.DISK, StorageType.DISK}) + .storagesPerDatanode(2) + .build(); + FileSystem fs = cluster.getFileSystem(); + DataNode dataNode = cluster.getDataNodes().get(0); + + // Create test file + Path filePath = new Path("testData"); + FsDatasetImpl fsDataSetImpl = (FsDatasetImpl) dataNode.getFSDataset(); + DFSTestUtil.createFile(fs, filePath, 100, (short) 1, 0); + ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, filePath); + ReplicaInfo replicaInfo = fsDataSetImpl.getReplicaInfo(block); + long oldMetaLength = replicaInfo.getMetadataLength(); + long oldDfsUsed = fsDataSetImpl.getDfsUsed(); + + // Append to file + int appendLength = 100; + DFSTestUtil.appendFile(fs, filePath, appendLength); + + block = DFSTestUtil.getFirstBlock(fs, filePath); + replicaInfo = fsDataSetImpl.getReplicaInfo(block); + long newMetaLength = replicaInfo.getMetadataLength(); + long newDfsUsed = fsDataSetImpl.getDfsUsed(); + + assert newDfsUsed == oldDfsUsed + appendLength + (newMetaLength - oldMetaLength) : + "When appending a file, the dfsused statistics of datanode are incorrect."; + } catch (Exception ex) { + LOG.info("Exception in testAppend ", ex); + fail("Exception while testing testAppend "); + } finally { + if (cluster.isClusterUp()) { + cluster.shutdown(); + } + } + } } diff --git a/hadoop-mapreduce-project/bin/mapred b/hadoop-mapreduce-project/bin/mapred index 3e52556a08f0b..e3f1f924edddd 100755 --- a/hadoop-mapreduce-project/bin/mapred +++ b/hadoop-mapreduce-project/bin/mapred @@ -37,6 +37,7 @@ function hadoop_usage hadoop_add_subcommand "frameworkuploader" admin "mapreduce framework upload" hadoop_add_subcommand "version" client "print the version" hadoop_add_subcommand "minicluster" client "CLI MiniCluster" + hadoop_add_subcommand "successfile" client "Print a _SUCCESS manifest from the manifest and S3A committers" hadoop_generate_usage "${HADOOP_SHELL_EXECNAME}" true } @@ -102,6 +103,9 @@ function mapredcmd_case version) HADOOP_CLASSNAME=org.apache.hadoop.util.VersionInfo ;; + successfile) + 
HADOOP_CLASSNAME=org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestPrinter + ;; minicluster) hadoop_add_classpath "${HADOOP_YARN_HOME}/${YARN_DIR}/timelineservice"'/*' hadoop_add_classpath "${HADOOP_YARN_HOME}/${YARN_DIR}/test"'/*' diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConfig.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConfig.java index 8a1ae0fcc9810..54d3799cb3cf4 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConfig.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConfig.java @@ -21,6 +21,9 @@ import java.io.IOException; import java.util.Objects; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -51,6 +54,9 @@ */ public final class ManifestCommitterConfig implements IOStatisticsSource { + private static final Logger LOG = LoggerFactory.getLogger( + ManifestCommitterConfig.class); + /** * Final destination of work. * This is unqualified. @@ -153,6 +159,12 @@ public final class ManifestCommitterConfig implements IOStatisticsSource { */ private final int writerQueueCapacity; + /** + * How many attempts to save a task manifest by save and rename + * before giving up. + */ + private final int saveManifestAttempts; + /** * Constructor. * @param outputPath destination path of the job. @@ -198,6 +210,14 @@ public final class ManifestCommitterConfig implements IOStatisticsSource { this.writerQueueCapacity = conf.getInt( OPT_WRITER_QUEUE_CAPACITY, DEFAULT_WRITER_QUEUE_CAPACITY); + int attempts = conf.getInt(OPT_MANIFEST_SAVE_ATTEMPTS, + OPT_MANIFEST_SAVE_ATTEMPTS_DEFAULT); + if (attempts < 1) { + LOG.warn("Invalid value for {}: {}", + OPT_MANIFEST_SAVE_ATTEMPTS, attempts); + attempts = 1; + } + this.saveManifestAttempts = attempts; // if constructed with a task attempt, build the task ID and path. if (context instanceof TaskAttemptContext) { @@ -332,6 +352,10 @@ public String getName() { return name; } + public int getSaveManifestAttempts() { + return saveManifestAttempts; + } + /** * Get writer queue capacity. 
* @return the queue capacity diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConstants.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConstants.java index dc5ccb2e1df3a..8f359e45000f3 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConstants.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterConstants.java @@ -132,7 +132,9 @@ public final class ManifestCommitterConstants { * Should dir cleanup do parallel deletion of task attempt dirs * before trying to delete the toplevel dirs. * For GCS this may deliver speedup, while on ABFS it may avoid - * timeouts in certain deployments. + * timeouts in certain deployments, something + * {@link #OPT_CLEANUP_PARALLEL_DELETE_BASE_FIRST} + * can alleviate. * Value: {@value}. */ public static final String OPT_CLEANUP_PARALLEL_DELETE = @@ -143,6 +145,20 @@ public final class ManifestCommitterConstants { */ public static final boolean OPT_CLEANUP_PARALLEL_DELETE_DIRS_DEFAULT = true; + /** + * Should parallel cleanup try to delete the base first? + * Best for azure as it skips the task attempt deletions unless + * the toplevel delete fails. + * Value: {@value}. + */ + public static final String OPT_CLEANUP_PARALLEL_DELETE_BASE_FIRST = + OPT_PREFIX + "cleanup.parallel.delete.base.first"; + + /** + * Default value of option {@link #OPT_CLEANUP_PARALLEL_DELETE_BASE_FIRST}: {@value}. + */ + public static final boolean OPT_CLEANUP_PARALLEL_DELETE_BASE_FIRST_DEFAULT = false; + /** * Threads to use for IO. */ @@ -260,6 +276,19 @@ public final class ManifestCommitterConstants { */ public static final int DEFAULT_WRITER_QUEUE_CAPACITY = OPT_IO_PROCESSORS_DEFAULT; + /** + * How many attempts to save a task manifest by save and rename + * before giving up. + * Value: {@value}. + */ + public static final String OPT_MANIFEST_SAVE_ATTEMPTS = + OPT_PREFIX + "manifest.save.attempts"; + + /** + * Default value of {@link #OPT_MANIFEST_SAVE_ATTEMPTS}: {@value}. 
+ */ + public static final int OPT_MANIFEST_SAVE_ATTEMPTS_DEFAULT = 5; + private ManifestCommitterConstants() { } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterStatisticNames.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterStatisticNames.java index 243fd6087328d..2326259a08966 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterStatisticNames.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterStatisticNames.java @@ -187,6 +187,12 @@ public final class ManifestCommitterStatisticNames { public static final String OP_SAVE_TASK_MANIFEST = "task_stage_save_task_manifest"; + /** + * Save a summary file: {@value}. + */ + public static final String OP_SAVE_SUMMARY_FILE = + "task_stage_save_summary_file"; + /** * Task abort: {@value}. */ @@ -259,6 +265,9 @@ public final class ManifestCommitterStatisticNames { public static final String OP_STAGE_TASK_SCAN_DIRECTORY = "task_stage_scan_directory"; + /** Delete a directory: {@value}. */ + public static final String OP_DELETE_DIR = "op_delete_dir"; + private ManifestCommitterStatisticNames() { } } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestPrinter.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestPrinter.java index c95ec7b11be05..f12f80c641268 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestPrinter.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/files/ManifestPrinter.java @@ -36,7 +36,7 @@ */ public class ManifestPrinter extends Configured implements Tool { - private static final String USAGE = "ManifestPrinter "; + private static final String USAGE = "successfile "; /** * Output for printing. 
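The constants above define two new tunables; with the committer's `OPT_PREFIX` they resolve to `mapreduce.manifest.committer.manifest.save.attempts` and `mapreduce.manifest.committer.cleanup.parallel.delete.base.first` (the latter also appears in the documentation table later in this patch). A minimal sketch of setting them on a job configuration; the values and the wrapper class name are illustrative, not defaults:

```java
import org.apache.hadoop.conf.Configuration;

public final class ManifestCommitterTuning {

  /** Illustrative settings for the two options added in this patch. */
  public static Configuration tunedConf() {
    Configuration conf = new Configuration();
    // allow a couple of extra save+rename attempts beyond the default of 5
    conf.setInt("mapreduce.manifest.committer.manifest.save.attempts", 7);
    // try a single recursive delete of the base _temporary dir before
    // falling back to parallel task-attempt directory deletion
    conf.setBoolean("mapreduce.manifest.committer.cleanup.parallel.delete.base.first", true);
    return conf;
  }
}
```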
@@ -88,7 +88,7 @@ public ManifestSuccessData loadAndPrintManifest(FileSystem fs, Path path) return success; } - private void printManifest(ManifestSuccessData success) { + public void printManifest(ManifestSuccessData success) { field("succeeded", success.getSuccess()); field("created", success.getDate()); field("committer", success.getCommitter()); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/InternalConstants.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/InternalConstants.java index 15f9899f3551e..c90ea39d0c7fe 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/InternalConstants.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/InternalConstants.java @@ -73,6 +73,7 @@ private InternalConstants() { OP_CREATE_ONE_DIRECTORY, OP_DIRECTORY_SCAN, OP_DELETE, + OP_DELETE_DIR, OP_DELETE_FILE_UNDER_DESTINATION, OP_GET_FILE_STATUS, OP_IS_DIRECTORY, @@ -85,6 +86,7 @@ private InternalConstants() { OP_MSYNC, OP_PREPARE_DIR_ANCESTORS, OP_RENAME_FILE, + OP_SAVE_SUMMARY_FILE, OP_SAVE_TASK_MANIFEST, OBJECT_LIST_REQUEST, @@ -127,4 +129,11 @@ private InternalConstants() { /** Schemas of filesystems we know to not work with this committer. */ public static final Set UNSUPPORTED_FS_SCHEMAS = ImmutableSet.of("s3a", "wasb"); + + /** + * Interval in milliseconds between save retries. + * Value {@value} milliseconds. + */ + public static final int SAVE_SLEEP_INTERVAL = 500; + } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperations.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperations.java index b81fa9dd32add..03e3ce0f0ade0 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperations.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperations.java @@ -97,6 +97,35 @@ public boolean isFile(Path path) throws IOException { public abstract boolean delete(Path path, boolean recursive) throws IOException; + /** + * Forward to {@code delete(Path, true)} + * unless overridden. + *
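`ManifestPrinter` is a `Tool`, so the new `successfile` subcommand is only a classname binding in the `mapred` script. A hedged sketch of driving the same printer programmatically, assuming the usual `ToolRunner` contract and a no-argument `ManifestPrinter` constructor (the wrapper class here is invented for illustration):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestPrinter;
import org.apache.hadoop.util.ToolRunner;

public final class PrintSuccessFile {
  public static void main(String[] args) throws Exception {
    // args: path of a manifest committer / S3A committer _SUCCESS file,
    // equivalent to: mapred successfile <path>
    int rc = ToolRunner.run(new Configuration(), new ManifestPrinter(), args);
    System.exit(rc);
  }
}
```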

+ * If it returns without an error: there is no file at + * the end of the path. + * @param path path + * @return outcome + * @throws IOException failure. + */ + public boolean deleteFile(Path path) + throws IOException { + return delete(path, false); + } + + /** + * Call {@code FileSystem#delete(Path, true)} or equivalent. + *

+ * If it returns without an error: there is nothing at + * the end of the path. + * @param path path + * @return outcome + * @throws IOException failure. + */ + public boolean deleteRecursive(Path path) + throws IOException { + return delete(path, true); + } + /** * Forward to {@link FileSystem#mkdirs(Path)}. * Usual "what does 'false' mean" ambiguity. diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java index 9a0b972bc735b..ab3a6398de114 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java @@ -108,6 +108,11 @@ public boolean delete(Path path, boolean recursive) return fileSystem.delete(path, recursive); } + @Override + public boolean deleteRecursive(final Path path) throws IOException { + return fileSystem.delete(path, true); + } + @Override public boolean mkdirs(Path path) throws IOException { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbortTaskStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbortTaskStage.java index c2b44c2a924fd..0ab7c08dc2386 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbortTaskStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbortTaskStage.java @@ -25,6 +25,7 @@ import org.apache.hadoop.fs.Path; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_DELETE_DIR; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_ABORT_TASK; /** @@ -55,7 +56,11 @@ protected Path executeStage(final Boolean suppressExceptions) final Path dir = getTaskAttemptDir(); if (dir != null) { LOG.info("{}: Deleting task attempt directory {}", getName(), dir); - deleteDir(dir, suppressExceptions); + if (suppressExceptions) { + deleteRecursiveSuppressingExceptions(dir, OP_DELETE_DIR); + } else { + deleteRecursive(dir, OP_DELETE_DIR); + } } return dir; } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbstractJobOrTaskStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbstractJobOrTaskStage.java index 161153c82faac..76bc0d7cd2799 100644 --- 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbstractJobOrTaskStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/AbstractJobOrTaskStage.java @@ -21,7 +21,9 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.time.Duration; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Supplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,6 +35,7 @@ import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.statistics.DurationTracker; import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore; +import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest; @@ -53,14 +56,18 @@ import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.createTracker; import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration; import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfInvocation; +import static org.apache.hadoop.io.retry.RetryPolicies.retryUpToMaximumCountWithProportionalSleep; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.MANIFEST_SUFFIX; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_COMMIT_FILE_RENAME; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_COMMIT_FILE_RENAME_RECOVERED; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_DELETE_DIR; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_LOAD_MANIFEST; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_MSYNC; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_RENAME_DIR; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_RENAME_FILE; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_SAVE_TASK_MANIFEST; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.AuditingIntegration.enterStageWorker; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.SAVE_SLEEP_INTERVAL; /** * A Stage in Task/Job Commit. @@ -366,6 +373,7 @@ public final IOStatisticsStore getIOStatistics() { */ protected final void progress() { if (stageConfig.getProgressable() != null) { + LOG.trace("{}: Progressing", getName()); stageConfig.getProgressable().progress(); } } @@ -424,7 +432,7 @@ protected final boolean isFile( * @return status or null * @throws IOException IO Failure. */ - protected final boolean delete( + public final boolean delete( final Path path, final boolean recursive) throws IOException { @@ -440,14 +448,34 @@ protected final boolean delete( * @return status or null * @throws IOException IO Failure. 
*/ - protected Boolean delete( + public Boolean delete( final Path path, final boolean recursive, final String statistic) throws IOException { - return trackDuration(getIOStatistics(), statistic, () -> { - return operations.delete(path, recursive); - }); + if (recursive) { + return deleteRecursive(path, statistic); + } else { + return deleteFile(path, statistic); + } + } + + /** + * Delete a file at a path. + *

+ * If it returns without an error: there is nothing at + * the end of the path. + * @param path path + * @param statistic statistic to update + * @return outcome. + * @throws IOException IO Failure. + */ + public boolean deleteFile( + final Path path, + final String statistic) + throws IOException { + return trackDuration(getIOStatistics(), statistic, () -> + operations.deleteFile(path)); } /** @@ -457,7 +485,7 @@ protected Boolean delete( * @return true if the directory was created/exists. * @throws IOException IO Failure. */ - protected final boolean mkdirs( + public final boolean mkdirs( final Path path, final boolean escalateFailure) throws IOException { @@ -494,7 +522,7 @@ protected final RemoteIterator listStatusIterator( * @return the manifest. * @throws IOException IO Failure. */ - protected final TaskManifest loadManifest( + public final TaskManifest loadManifest( final FileStatus status) throws IOException { LOG.trace("{}: loadManifest('{}')", getName(), status); @@ -582,19 +610,123 @@ protected final Path directoryMustExist( * Save a task manifest or summary. This will be done by * writing to a temp path and then renaming. * If the destination path exists: Delete it. + * This will retry so that a rename failure from abfs load or IO errors + * will not fail the task. * @param manifestData the manifest/success file * @param tempPath temp path for the initial save * @param finalPath final path for rename. - * @throws IOException failure to load/parse + * @return the manifest saved. + * @throws IOException failure to rename after retries. */ @SuppressWarnings("unchecked") - protected final void save(T manifestData, + protected final T save( + final T manifestData, final Path tempPath, final Path finalPath) throws IOException { - LOG.trace("{}: save('{}, {}, {}')", getName(), manifestData, tempPath, finalPath); - trackDurationOfInvocation(getIOStatistics(), OP_SAVE_TASK_MANIFEST, () -> - operations.save(manifestData, tempPath, true)); - renameFile(tempPath, finalPath); + return saveManifest(() -> manifestData, tempPath, finalPath, OP_SAVE_TASK_MANIFEST); + } + + /** + * Generate and save a task manifest or summary file. + * This is be done by writing to a temp path and then renaming. + *

+ * If the destination path exists: Delete it before the rename. + *

+ * This will retry so that a rename failure from abfs load or IO errors + * such as delete or save failure will not fail the task. + *

+ * The {@code manifestSource} supplier is invoked to get the manifest data + * on every attempt. + * This permits statistics to be updated, including those of failures. + * @param manifestSource supplier the manifest/success file + * @param tempPath temp path for the initial save + * @param finalPath final path for rename. + * @param statistic statistic to use for timing + * @return the manifest saved. + * @throws IOException failure to save/delete/rename after retries. + */ + @SuppressWarnings("unchecked") + protected final T saveManifest( + final Supplier manifestSource, + final Path tempPath, + final Path finalPath, + String statistic) throws IOException { + + int retryCount = 0; + RetryPolicy retryPolicy = retryUpToMaximumCountWithProportionalSleep( + getStageConfig().getManifestSaveAttempts(), + SAVE_SLEEP_INTERVAL, + TimeUnit.MILLISECONDS); + + boolean success = false; + T savedManifest = null; + // loop until returning a value or raising an exception + while (!success) { + try { + // get the latest manifest, which may include updated statistics + final T manifestData = requireNonNull(manifestSource.get()); + LOG.info("{}: save manifest to {} then rename as {}'); retry count={}", + getName(), tempPath, finalPath, retryCount); + trackDurationOfInvocation(getIOStatistics(), statistic, () -> { + + // delete temp path. + // even though this is written with overwrite=true, this extra recursive + // delete also handles a directory being there. + // this should not happen as no part of the commit protocol creates a directory + // -this is just a little bit of due diligence. + deleteRecursive(tempPath, OP_DELETE); + + // save the temp file. + operations.save(manifestData, tempPath, true); + // get the length and etag. + final FileStatus st = getFileStatus(tempPath); + + // commit rename of temporary file to the final path; deleting the destination first. + final CommitOutcome outcome = commitFile( + new FileEntry(tempPath, finalPath, st.getLen(), getEtag(st)), + true); + if (outcome.recovered) { + LOG.warn("Task manifest file {} committed using rename recovery", + manifestData); + } + + }); + // success: save the manifest and declare success + savedManifest = manifestData; + success = true; + } catch (IOException e) { + // failure. + // log then decide whether to sleep and retry or give up. + LOG.warn("{}: Failed to save and commit file {} renamed to {}; retry count={}", + getName(), tempPath, finalPath, retryCount, e); + // increment that count. + retryCount++; + RetryPolicy.RetryAction retryAction; + try { + retryAction = retryPolicy.shouldRetry(e, retryCount, 0, true); + } catch (Exception ex) { + // it's not clear why this probe can raise an exception; it is just + // caught and mapped to a fail. + LOG.debug("Failure in retry policy", ex); + retryAction = RetryPolicy.RetryAction.FAIL; + } + LOG.debug("{}: Retry action: {}", getName(), retryAction.action); + if (retryAction.action == RetryPolicy.RetryAction.RetryDecision.FAIL) { + // too many failures: escalate. + throw e; + } + // else, sleep + try { + LOG.info("{}: Sleeping for {} ms before retrying", + getName(), retryAction.delayMillis); + Thread.sleep(retryAction.delayMillis); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + } + } + } + // success: return the manifest which was saved. + return savedManifest; } /** @@ -609,8 +741,10 @@ public String getEtag(FileStatus status) { } /** - * Rename a file from source to dest; if the underlying FS API call - * returned false that's escalated to an IOE. 
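A self-contained sketch of the bounded save/rename retry pattern implemented by `saveManifest()` above: it uses the same `RetryPolicies.retryUpToMaximumCountWithProportionalSleep` policy and a 500 ms interval mirroring `SAVE_SLEEP_INTERVAL`. The `SaveAction` callback and class name are invented for illustration; this is not the stage code itself:

```java
import java.io.IOException;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;

public final class SaveWithRetries {

  /** Callback performing one save-to-temp-then-rename attempt. */
  @FunctionalInterface
  public interface SaveAction {
    void run() throws IOException;
  }

  /** Run the action, retrying up to {@code attempts} times with proportional sleeps. */
  public static void saveWithRetries(SaveAction action, int attempts)
      throws IOException, InterruptedException {
    RetryPolicy policy = RetryPolicies.retryUpToMaximumCountWithProportionalSleep(
        attempts, 500, TimeUnit.MILLISECONDS);
    int retryCount = 0;
    while (true) {
      try {
        action.run();                       // save temp file, then rename into place
        return;                             // success
      } catch (IOException e) {
        retryCount++;
        RetryPolicy.RetryAction retryAction;
        try {
          retryAction = policy.shouldRetry(e, retryCount, 0, true);
        } catch (Exception ex) {
          throw e;                          // policy probe failed: rethrow original error
        }
        if (retryAction.action == RetryPolicy.RetryAction.RetryDecision.FAIL) {
          throw e;                          // retry budget exhausted
        }
        Thread.sleep(retryAction.delayMillis);
      }
    }
  }
}
```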
+ * Rename a file from source to dest. + *

+ * The destination is always deleted through a call to + * {@link #maybeDeleteDest(boolean, Path)}. * @param source source file. * @param dest dest file * @throws IOException failure @@ -618,7 +752,6 @@ public String getEtag(FileStatus status) { */ protected final void renameFile(final Path source, final Path dest) throws IOException { - maybeDeleteDest(true, dest); executeRenamingOperation("renameFile", source, dest, OP_RENAME_FILE, () -> operations.renameFile(source, dest)); @@ -637,7 +770,7 @@ protected final void renameDir(final Path source, final Path dest) maybeDeleteDest(true, dest); executeRenamingOperation("renameDir", source, dest, - OP_RENAME_FILE, () -> + OP_RENAME_DIR, () -> operations.renameDir(source, dest) ); } @@ -669,13 +802,14 @@ protected final CommitOutcome commitFile(FileEntry entry, // note any delay which took place noteAnyRateLimiting(STORE_IO_RATE_LIMITED, result.getWaitTime()); } + return new CommitOutcome(result.recovered()); } else { // commit with a simple rename; failures will be escalated. executeRenamingOperation("renameFile", source, dest, OP_COMMIT_FILE_RENAME, () -> operations.renameFile(source, dest)); + return new CommitOutcome(false); } - return new CommitOutcome(); } /** @@ -696,12 +830,15 @@ protected boolean storeSupportsResilientCommit() { */ private void maybeDeleteDest(final boolean deleteDest, final Path dest) throws IOException { - if (deleteDest && getFileStatusOrNull(dest) != null) { - - boolean deleted = delete(dest, true); - // log the outcome in case of emergency diagnostics traces - // being needed. - LOG.debug("{}: delete('{}') returned {}'", getName(), dest, deleted); + if (deleteDest) { + final FileStatus st = getFileStatusOrNull(dest); + if (st != null) { + if (st.isDirectory()) { + deleteRecursive(dest, OP_DELETE_DIR); + } else { + deleteFile(dest, OP_DELETE); + } + } } } @@ -792,6 +929,14 @@ private PathIOException escalateRenameFailure(String operation, */ public static final class CommitOutcome { + /** + * Dit the commit recover from a failure? + */ + public final boolean recovered; + + public CommitOutcome(final boolean recovered) { + this.recovered = recovered; + } } /** @@ -866,7 +1011,7 @@ protected final Path getTaskAttemptDir() { } /** - * Get the task attemptDir; raise an NPE + * Get the task attemptDir and raise an NPE * if it is null. * @return a non-null task attempt dir. */ @@ -915,26 +1060,35 @@ protected final TaskPool.Submitter getIOProcessors(int size) { } /** - * Delete a directory, possibly suppressing exceptions. + * Delete a directory (or a file). * @param dir directory. - * @param suppressExceptions should exceptions be suppressed? + * @param statistic statistic to use + * @return true if the path is no longer present. * @throws IOException exceptions raised in delete if not suppressed. - * @return any exception caught and suppressed */ - protected IOException deleteDir( + protected boolean deleteRecursive( final Path dir, - final Boolean suppressExceptions) + final String statistic) throws IOException { + return trackDuration(getIOStatistics(), statistic, () -> + operations.deleteRecursive(dir)); + } + + /** + * Delete a directory or file, catching exceptions. + * @param dir directory. + * @param statistic statistic to use + * @return any exception caught. 
+ */ + protected IOException deleteRecursiveSuppressingExceptions( + final Path dir, + final String statistic) { try { - delete(dir, true); + deleteRecursive(dir, statistic); return null; } catch (IOException ex) { LOG.info("Error deleting {}: {}", dir, ex.toString()); - if (!suppressExceptions) { - throw ex; - } else { - return ex; - } + return ex; } } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java index 77b80aaf67fd6..054ec26fb00f5 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java @@ -40,7 +40,10 @@ import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED; import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED_DEFAULT; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE_BASE_FIRST; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE_BASE_FIRST_DEFAULT; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE_DIRS_DEFAULT; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_DELETE_DIR; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP; /** @@ -49,7 +52,7 @@ * Returns: the outcome of the overall operation * The result is detailed purely for the benefit of tests, which need * to make assertions about error handling and fallbacks. - * + *

* There's a few known issues with the azure and GCS stores which * this stage tries to address. * - Google GCS directory deletion is O(entries), so is slower for big jobs. @@ -57,19 +60,28 @@ * when not the store owner triggers a scan down the tree to verify the * caller has the permission to delete each subdir. * If this scan takes over 90s, the operation can time out. - * + *

* The main solution for both of these is that task attempts are * deleted in parallel, in different threads. * This will speed up GCS cleanup and reduce the risk of * abfs related timeouts. * Exceptions during cleanup can be suppressed, * so that these do not cause the job to fail. - * + *

+ * There is one weakness of this design: the number of delete operations + * is 1 + number of task attempts, which, on ABFS can generate excessive + * load. + * For this reason, there is an option to attempt to delete the base directory + * first; if this does not time out then, on Azure ADLS Gen2 storage, + * this is the most efficient cleanup. + * Only if that attempt fails for any reason then the parallel delete + * phase takes place. + *

* Also, some users want to be able to run multiple independent jobs * targeting the same output directory simultaneously. * If one job deletes the directory `__temporary` all the others * will fail. - * + *

* This can be addressed by disabling cleanup entirely. * */ @@ -128,7 +140,7 @@ protected Result executeStage( stageName = getStageName(args); // this is $dest/_temporary final Path baseDir = requireNonNull(getStageConfig().getOutputTempSubDir()); - LOG.debug("{}: Cleaup of directory {} with {}", getName(), baseDir, args); + LOG.debug("{}: Cleanup of directory {} with {}", getName(), baseDir, args); if (!args.enabled) { LOG.info("{}: Cleanup of {} disabled", getName(), baseDir); return new Result(Outcome.DISABLED, baseDir, @@ -142,64 +154,105 @@ protected Result executeStage( } Outcome outcome = null; - IOException exception; + IOException exception = null; + boolean baseDirDeleted = false; // to delete. LOG.info("{}: Deleting job directory {}", getName(), baseDir); + final long directoryCount = args.directoryCount; + if (directoryCount > 0) { + // log the expected directory count, which drives duration in GCS + // and may cause timeouts on azure if the count is too high for a + // timely permissions tree scan. + LOG.info("{}: Expected directory count: {}", getName(), directoryCount); + } + progress(); + // check and maybe execute parallel delete of task attempt dirs. if (args.deleteTaskAttemptDirsInParallel) { - // Attempt to do a parallel delete of task attempt dirs; - // don't overreact if a delete fails, but stop trying - // to delete the others, and fall back to deleting the - // job dir. - Path taskSubDir - = getStageConfig().getJobAttemptTaskSubDir(); - try (DurationInfo info = new DurationInfo(LOG, - "parallel deletion of task attempts in %s", - taskSubDir)) { - RemoteIterator dirs = - RemoteIterators.filteringRemoteIterator( - listStatusIterator(taskSubDir), - FileStatus::isDirectory); - TaskPool.foreach(dirs) - .executeWith(getIOProcessors()) - .stopOnFailure() - .suppressExceptions(false) - .run(this::rmTaskAttemptDir); - getIOStatistics().aggregate((retrieveIOStatistics(dirs))); - - if (getLastDeleteException() != null) { - // one of the task attempts failed. - throw getLastDeleteException(); + + + if (args.parallelDeleteAttemptBaseDeleteFirst) { + // attempt to delete the base dir first. + // This can reduce ABFS delete load but may time out + // (which the fallback to parallel delete will handle). + // on GCS it is slow. + try (DurationInfo info = new DurationInfo(LOG, true, + "Initial delete of %s", baseDir)) { + exception = deleteOneDir(baseDir); + if (exception == null) { + // success: record this as the outcome, + outcome = Outcome.DELETED; + // and flag that the the parallel delete should be skipped because the + // base directory is alredy deleted. + baseDirDeleted = true; + } else { + // failure: log and continue + LOG.warn("{}: Exception on initial attempt at deleting base dir {}" + + " with directory count {}. Falling back to parallel delete", + getName(), baseDir, directoryCount, exception); + } + } + } + if (!baseDirDeleted) { + // no base delete attempted or it failed. + // Attempt to do a parallel delete of task attempt dirs; + // don't overreact if a delete fails, but stop trying + // to delete the others, and fall back to deleting the + // job dir. 
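A simplified, stand-alone sketch of the cleanup strategy described in the class javadoc above: one optional recursive delete of the base directory first, then parallel deletion of task attempt directories as the fallback. The class and method names are hypothetical and this is not the stage's actual implementation, which goes through `TaskPool`, IOStatistics tracking and the suppress-exceptions logic:

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class BaseFirstCleanupSketch {

  /** Delete baseDir, optionally trying a single recursive delete first. */
  public static void cleanup(FileSystem fs, Path baseDir, Path taskSubDir,
      boolean baseFirst, int threads) throws Exception {
    if (baseFirst) {
      try {
        fs.delete(baseDir, true);           // one delete call: cheapest on ABFS
        return;
      } catch (IOException e) {
        // fall through to the parallel path, e.g. after a store-side timeout
      }
    }
    ExecutorService pool = Executors.newFixedThreadPool(threads);
    try {
      List<Future<Boolean>> deletions = new ArrayList<>();
      for (FileStatus st : fs.listStatus(taskSubDir)) {
        if (st.isDirectory()) {
          deletions.add(pool.submit(() -> fs.delete(st.getPath(), true)));
        }
      }
      for (Future<Boolean> f : deletions) {
        f.get();                            // surface any per-directory failure
      }
    } finally {
      pool.shutdown();
    }
    fs.delete(baseDir, true);               // finally remove the base directory itself
  }
}
```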
+ Path taskSubDir + = getStageConfig().getJobAttemptTaskSubDir(); + try (DurationInfo info = new DurationInfo(LOG, true, + "parallel deletion of task attempts in %s", + taskSubDir)) { + RemoteIterator dirs = + RemoteIterators.filteringRemoteIterator( + listStatusIterator(taskSubDir), + FileStatus::isDirectory); + TaskPool.foreach(dirs) + .executeWith(getIOProcessors()) + .stopOnFailure() + .suppressExceptions(false) + .run(this::rmTaskAttemptDir); + getIOStatistics().aggregate((retrieveIOStatistics(dirs))); + + if (getLastDeleteException() != null) { + // one of the task attempts failed. + throw getLastDeleteException(); + } else { + // success: record this as the outcome. + outcome = Outcome.PARALLEL_DELETE; + } + } catch (FileNotFoundException ex) { + // not a problem if there's no dir to list. + LOG.debug("{}: Task attempt dir {} not found", getName(), taskSubDir); + outcome = Outcome.DELETED; + } catch (IOException ex) { + // failure. Log and continue + LOG.info( + "{}: Exception while listing/deleting task attempts under {}; continuing", + getName(), + taskSubDir, ex); } - // success: record this as the outcome. - outcome = Outcome.PARALLEL_DELETE; - } catch (FileNotFoundException ex) { - // not a problem if there's no dir to list. - LOG.debug("{}: Task attempt dir {} not found", getName(), taskSubDir); - outcome = Outcome.DELETED; - } catch (IOException ex) { - // failure. Log and continue - LOG.info( - "{}: Exception while listing/deleting task attempts under {}; continuing", - getName(), - taskSubDir, ex); - // not overreacting here as the base delete will still get executing - outcome = Outcome.DELETED; } } - // Now the top-level deletion; exception gets saved - exception = deleteOneDir(baseDir); - if (exception != null) { - // failure, report and continue - // assume failure. - outcome = Outcome.FAILURE; - } else { - // if the outcome isn't already recorded as parallel delete, - // mark is a simple delete. - if (outcome == null) { - outcome = Outcome.DELETED; + // Now the top-level deletion if not already executed; exception gets saved + if (!baseDirDeleted) { + exception = deleteOneDir(baseDir); + if (exception != null) { + // failure, report and continue + LOG.warn("{}: Exception on final attempt at deleting base dir {}" + + " with directory count {}", + getName(), baseDir, directoryCount, exception); + // assume failure. + outcome = Outcome.FAILURE; + } else { + // if the outcome isn't already recorded as parallel delete, + // mark is a simple delete. + if (outcome == null) { + outcome = Outcome.DELETED; + } } } @@ -235,7 +288,7 @@ private void rmTaskAttemptDir(FileStatus status) throws IOException { } /** - * Delete a directory. + * Delete a directory suppressing exceptions. * The {@link #deleteFailureCount} counter. * is incremented on every failure. * @param dir directory @@ -246,21 +299,22 @@ private IOException deleteOneDir(final Path dir) throws IOException { deleteDirCount.incrementAndGet(); - IOException ex = deleteDir(dir, true); - if (ex != null) { - deleteFailure(ex); - } - return ex; + return noteAnyDeleteFailure( + deleteRecursiveSuppressingExceptions(dir, OP_DELETE_DIR)); } /** - * Note a failure. + * Note a failure if the exception is not null. 
* @param ex exception + * @return the exception */ - private synchronized void deleteFailure(IOException ex) { - // excaption: add the count - deleteFailureCount.incrementAndGet(); - lastDeleteException = ex; + private synchronized IOException noteAnyDeleteFailure(IOException ex) { + if (ex != null) { + // exception: add the count + deleteFailureCount.incrementAndGet(); + lastDeleteException = ex; + } + return ex; } /** @@ -287,26 +341,47 @@ public static final class Arguments { /** Attempt parallel delete of task attempt dirs? */ private final boolean deleteTaskAttemptDirsInParallel; + /** + * Make an initial attempt to delete the base directory. + * This will reduce IO load on abfs. If it times out, the + * parallel delete will be the fallback. + */ + private final boolean parallelDeleteAttemptBaseDeleteFirst; + /** Ignore failures? */ private final boolean suppressExceptions; + /** + * Non-final count of directories. + * Default value, "0", means "unknown". + * This can be dynamically updated during job commit. + */ + private long directoryCount; + /** * Arguments to the stage. * @param statisticName stage name to report * @param enabled is the stage enabled? * @param deleteTaskAttemptDirsInParallel delete task attempt dirs in * parallel? + * @param parallelDeleteAttemptBaseDeleteFirst Make an initial attempt to + * delete the base directory in a parallel delete? * @param suppressExceptions suppress exceptions? + * @param directoryCount directories under job dir; 0 means unknown. */ public Arguments( final String statisticName, final boolean enabled, final boolean deleteTaskAttemptDirsInParallel, - final boolean suppressExceptions) { + final boolean parallelDeleteAttemptBaseDeleteFirst, + final boolean suppressExceptions, + long directoryCount) { this.statisticName = statisticName; this.enabled = enabled; this.deleteTaskAttemptDirsInParallel = deleteTaskAttemptDirsInParallel; this.suppressExceptions = suppressExceptions; + this.parallelDeleteAttemptBaseDeleteFirst = parallelDeleteAttemptBaseDeleteFirst; + this.directoryCount = directoryCount; } public String getStatisticName() { @@ -325,6 +400,18 @@ public boolean isSuppressExceptions() { return suppressExceptions; } + public boolean isParallelDeleteAttemptBaseDeleteFirst() { + return parallelDeleteAttemptBaseDeleteFirst; + } + + public long getDirectoryCount() { + return directoryCount; + } + + public void setDirectoryCount(final long directoryCount) { + this.directoryCount = directoryCount; + } + @Override public String toString() { return "Arguments{" + @@ -332,6 +419,7 @@ public String toString() { + ", enabled=" + enabled + ", deleteTaskAttemptDirsInParallel=" + deleteTaskAttemptDirsInParallel + + ", parallelDeleteAttemptBaseDeleteFirst=" + parallelDeleteAttemptBaseDeleteFirst + ", suppressExceptions=" + suppressExceptions + '}'; } @@ -343,8 +431,9 @@ public String toString() { public static final Arguments DISABLED = new Arguments(OP_STAGE_JOB_CLEANUP, false, false, - false - ); + false, + false, + 0); /** * Build an options argument from a configuration, using the @@ -364,12 +453,16 @@ public static Arguments cleanupStageOptionsFromConfig( boolean deleteTaskAttemptDirsInParallel = conf.getBoolean( OPT_CLEANUP_PARALLEL_DELETE, OPT_CLEANUP_PARALLEL_DELETE_DIRS_DEFAULT); + boolean parallelDeleteAttemptBaseDeleteFirst = conf.getBoolean( + OPT_CLEANUP_PARALLEL_DELETE_BASE_FIRST, + OPT_CLEANUP_PARALLEL_DELETE_BASE_FIRST_DEFAULT); return new Arguments( statisticName, enabled, deleteTaskAttemptDirsInParallel, - suppressExceptions - ); + 
parallelDeleteAttemptBaseDeleteFirst, + suppressExceptions, + 0); } /** diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitJobStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitJobStage.java index 60fc6492ee621..8e01f7f40cba9 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitJobStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitJobStage.java @@ -37,6 +37,7 @@ import static org.apache.commons.lang3.StringUtils.isNotBlank; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_BYTES_COMMITTED_COUNT; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_FILES_COMMITTED_COUNT; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.COMMITTER_TASK_DIRECTORY_COUNT_MEAN; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CREATE_TARGET_DIRS; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_LOAD_MANIFESTS; @@ -161,7 +162,12 @@ protected CommitJobStage.Result executeStage( } // optional cleanup - new CleanupJobStage(stageConfig).apply(arguments.getCleanupArguments()); + final CleanupJobStage.Arguments cleanupArguments = arguments.getCleanupArguments(); + // determine the directory count + cleanupArguments.setDirectoryCount(iostats.counters() + .getOrDefault(COMMITTER_TASK_DIRECTORY_COUNT_MEAN, 0L)); + + new CleanupJobStage(stageConfig).apply(cleanupArguments); // and then, after everything else: optionally validate. if (arguments.isValidateOutput()) { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitTaskStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitTaskStage.java index bf5ba27ab8ad5..6ac2dec06a146 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitTaskStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CommitTaskStage.java @@ -23,6 +23,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest; @@ -69,19 +70,21 @@ protected CommitTaskStage.Result executeStage(final Void arguments) // the saving, but ... 
scanStage.addExecutionDurationToStatistics(getIOStatistics(), OP_STAGE_TASK_COMMIT); - // save a snapshot of the IO Statistics - final IOStatisticsSnapshot manifestStats = snapshotIOStatistics(); - manifestStats.aggregate(getIOStatistics()); - manifest.setIOStatistics(manifestStats); - - // Now save with rename - Path manifestPath = new SaveTaskManifestStage(getStageConfig()) - .apply(manifest); - return new CommitTaskStage.Result(manifestPath, manifest); + // Now save with retry, updating the statistics on every attempt. + Pair p = new SaveTaskManifestStage(getStageConfig()) + .apply(() -> { + /* save a snapshot of the IO Statistics */ + final IOStatisticsSnapshot manifestStats = snapshotIOStatistics(); + manifestStats.aggregate(getIOStatistics()); + manifest.setIOStatistics(manifestStats); + return manifest; + }); + return new CommitTaskStage.Result(p.getLeft(), p.getRight()); } /** - * Result of the stage. + * Result of the stage: the path the manifest was saved to + * and the manifest which was successfully saved. */ public static final class Result { /** The path the manifest was saved to. */ @@ -111,5 +114,9 @@ public TaskManifest getTaskManifest() { return taskManifest; } + @Override + public String toString() { + return "Result{path=" + path + '}'; + } } } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java index 1618cf591a590..18dc35960eb31 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java @@ -105,7 +105,7 @@ protected Result executeStage( throws IOException { final List directories = createAllDirectories(manifestDirs); - LOG.debug("{}: Created {} directories", getName(), directories.size()); + LOG.info("{}: Created {} directories", getName(), directories.size()); return new Result(new HashSet<>(directories), dirMap); } @@ -163,8 +163,9 @@ private List createAllDirectories(final Collection manifestDirs) // Now the real work. final int createCount = leaves.size(); - LOG.info("Preparing {} directory/directories; {} parent dirs implicitly created", - createCount, parents.size()); + LOG.info("Preparing {} directory/directories; {} parent dirs implicitly created." 
+ + " Files deleted: {}", + createCount, parents.size(), filesToDelete.size()); // now probe for and create the leaf dirs, which are those at the // bottom level @@ -232,7 +233,7 @@ private void deleteDirWithFile(Path dir) throws IOException { // report progress back progress(); LOG.info("{}: Deleting file {}", getName(), dir); - delete(dir, false, OP_DELETE); + deleteFile(dir, OP_DELETE); // note its final state addToDirectoryMap(dir, DirMapState.fileNowDeleted); } @@ -323,7 +324,7 @@ private DirMapState maybeCreateOneDirectory(DirEntry dirEntry) throws IOExceptio // is bad: delete a file LOG.info("{}: Deleting file where a directory should go: {}", getName(), st); - delete(path, false, OP_DELETE_FILE_UNDER_DESTINATION); + deleteFile(path, OP_DELETE_FILE_UNDER_DESTINATION); } else { // is good. LOG.warn("{}: Even though mkdirs({}) failed, there is now a directory there", diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveSuccessFileStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveSuccessFileStage.java index eb9c82f2ae739..96b94e609d673 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveSuccessFileStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveSuccessFileStage.java @@ -28,6 +28,7 @@ import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.TMP_SUFFIX; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_SAVE_SUMMARY_FILE; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_COMMIT; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_SAVE_SUCCESS; @@ -72,7 +73,7 @@ protected Path executeStage(final ManifestSuccessData successData) LOG.debug("{}: Saving _SUCCESS file to {} via {}", successFile, getName(), successTempFile); - save(successData, successTempFile, successFile); + saveManifest(() -> successData, successTempFile, successFile, OP_SAVE_SUMMARY_FILE); return successFile; } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java index fdaf0184cda20..179e7c22ef058 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java @@ -19,13 +19,16 @@ package org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages; 
import java.io.IOException; +import java.util.function.Supplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_SAVE_TASK_MANIFEST; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_SAVE_MANIFEST; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestPathForTask; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestTempPathForTaskAttempt; @@ -38,16 +41,36 @@ * Uses both the task ID and task attempt ID to determine the temp filename; * Before the rename of (temp, final-path), any file at the final path * is deleted. + *

* This is so that when this stage is invoked in a task commit, its output * overwrites any of the first commit. * When it succeeds, therefore, unless there is any subsequent commit of * another task, the task manifest at the final path is from this * operation. - * - * Returns the path where the manifest was saved. + *

+ * If the save and rename fails, there are a limited number of retries, with no sleep + * interval. + * This is to briefly try recover from any transient rename() failure, including a + * race condition with any other task commit. + *

+ * <ol>
+ * <li>If the previous task commit has already succeeded, this rename will overwrite it. + * Both task attempts will report success.</li>
+ * <li>If, after writing, another task attempt overwrites it, again, both + * task attempts will report success.</li>
+ * <li>If another task commits between the delete() and rename() operations, the retry will + * attempt to recover by repeating the manifest write, and then report success.</li>
+ * </ol>
+ * This means that multiple task attempts may report success, but only one will have its actual + * manifest saved. + * The mapreduce and spark committers only schedule a second task commit attempt if the first + * task attempt's commit operation fails or fails to report success in the allocated time. + * The overwrite with retry loop is an attempt to ensure that the second attempt will report + * success, if a partitioned cluster means that the original TA commit is still in progress. + *

+ * Returns (the path where the manifest was saved, the manifest). */ public class SaveTaskManifestStage extends - AbstractJobOrTaskStage { + AbstractJobOrTaskStage, Pair> { private static final Logger LOG = LoggerFactory.getLogger( SaveTaskManifestStage.class); @@ -57,14 +80,16 @@ public SaveTaskManifestStage(final StageConfig stageConfig) { } /** - * Save the manifest to a temp file and rename to the final + * Generate and save a manifest to a temp file and rename to the final * manifest destination. - * @param manifest manifest + * The manifest is generated on each retried attempt. + * @param manifestSource supplier the manifest/success file + * * @return the path to the final entry * @throws IOException IO failure. */ @Override - protected Path executeStage(final TaskManifest manifest) + protected Pair executeStage(Supplier manifestSource) throws IOException { final Path manifestDir = getTaskManifestDir(); @@ -74,8 +99,9 @@ protected Path executeStage(final TaskManifest manifest) Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir, getRequiredTaskAttemptId()); LOG.info("{}: Saving manifest file to {}", getName(), manifestFile); - save(manifest, manifestTempFile, manifestFile); - return manifestFile; + final TaskManifest manifest = + saveManifest(manifestSource, manifestTempFile, manifestFile, OP_SAVE_TASK_MANIFEST); + return Pair.of(manifestFile, manifest); } } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupJobStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupJobStage.java index 9b873252df2cb..6e17aae23d201 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupJobStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SetupJobStage.java @@ -25,6 +25,7 @@ import org.apache.hadoop.fs.Path; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_DELETE; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_SETUP; /** @@ -55,7 +56,7 @@ protected Path executeStage(final Boolean deleteMarker) throws IOException { createNewDirectory("Creating task manifest dir", getTaskManifestDir()); // delete any success marker if so instructed. 
if (deleteMarker) { - delete(getStageConfig().getJobSuccessMarkerPath(), false); + deleteFile(getStageConfig().getJobSuccessMarkerPath(), OP_DELETE); } return path; } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java index b716d2f4b7f0c..55ff4f888881f 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java @@ -32,6 +32,7 @@ import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.DEFAULT_WRITER_QUEUE_CAPACITY; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER_FILE_LIMIT; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_MANIFEST_SAVE_ATTEMPTS_DEFAULT; /** * Stage Config. @@ -172,6 +173,12 @@ public class StageConfig { */ private int successMarkerFileLimit = SUCCESS_MARKER_FILE_LIMIT; + /** + * How many attempts to save a manifest by save and rename + * before giving up: {@value}. + */ + private int manifestSaveAttempts = OPT_MANIFEST_SAVE_ATTEMPTS_DEFAULT; + public StageConfig() { } @@ -604,6 +611,21 @@ public int getSuccessMarkerFileLimit() { return successMarkerFileLimit; } + public int getManifestSaveAttempts() { + return manifestSaveAttempts; + } + + /** + * Set builder value. + * @param value new value + * @return the builder + */ + public StageConfig withManifestSaveAttempts(final int value) { + checkOpen(); + manifestSaveAttempts = value; + return this; + } + /** * Enter the stage; calls back to * {@link #enterStageEventHandler} if non-null. diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/MapredCommands.md b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/MapredCommands.md index 6c2141820d878..859f293726bd3 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/MapredCommands.md +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/MapredCommands.md @@ -134,6 +134,11 @@ Usage: `mapred envvars` Display computed Hadoop environment variables. 
+# `successfile` + +Load and print a JSON `_SUCCESS` file from a [Manifest Committer](manifest_committer.html) or an S3A Committer, + + Administration Commands ----------------------- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md index da199a48d14c0..0ac03080195d4 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md @@ -15,14 +15,16 @@ # The Manifest Committer for Azure and Google Cloud Storage -This document how to use the _Manifest Committer_. + + +This documents how to use the _Manifest Committer_. The _Manifest_ committer is a committer for work which provides performance on ABFS for "real world" queries, and performance and correctness on GCS. It also works with other filesystems, including HDFS. However, the design is optimized for object stores where -listing operatons are slow and expensive. +listing operations are slow and expensive. The architecture and implementation of the committer is covered in [Manifest Committer Architecture](manifest_committer_architecture.html). @@ -31,10 +33,16 @@ The architecture and implementation of the committer is covered in The protocol and its correctness are covered in [Manifest Committer Protocol](manifest_committer_protocol.html). -It was added in March 2022, and should be considered unstable -in early releases. +It was added in March 2022. +As of April 2024, the problems which surfaced have been +* Memory use at scale. +* Directory deletion scalability. +* Resilience to task commit to rename failures. - +That is: the core algorithms is correct, but task commit +robustness was insufficient to some failure conditions. +And scale is always a challenge, even with components tested through +large TPC-DS test runs. ## Problem: @@ -70,10 +78,13 @@ This committer uses the extension point which came in for the S3A committers. Users can declare a new committer factory for abfs:// and gcs:// URLs. A suitably configured spark deployment will pick up the new committer. -Directory performance issues in job cleanup can be addressed by two options +Directory performance issues in job cleanup can be addressed by some options 1. The committer will parallelize deletion of task attempt directories before deleting the `_temporary` directory. -1. Cleanup can be disabled. . +2. An initial attempt to delete the `_temporary` directory before the parallel + attempt is made. +3. Exceptions can be supressed, so that cleanup failures do not fail the job +4. Cleanup can be disabled. The committer can be used with any filesystem client which has a "real" file rename() operation. @@ -112,8 +123,8 @@ These can be done in `core-site.xml`, if it is not defined in the `mapred-defaul ## Binding to the manifest committer in Spark. -In Apache Spark, the configuration can be done either with command line options (after the '--conf') or by using the `spark-defaults.conf` file. The following is an example of using `spark-defaults.conf` also including the configuration for Parquet with a subclass of the parquet -committer which uses the factory mechansim internally. 
+In Apache Spark, the configuration can be done either with command line options (after the `--conf`) or by using the `spark-defaults.conf` file. +The following is an example of using `spark-defaults.conf` also including the configuration for Parquet with a subclass of the parquet committer which uses the factory mechanism internally. ``` spark.hadoop.mapreduce.outputcommitter.factory.scheme.abfs org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory @@ -184,6 +195,7 @@ Here are the main configuration options of the committer. | `mapreduce.manifest.committer.io.threads` | Thread count for parallel operations | `64` | | `mapreduce.manifest.committer.summary.report.directory` | directory to save reports. | `""` | | `mapreduce.manifest.committer.cleanup.parallel.delete` | Delete temporary directories in parallel | `true` | +| `mapreduce.manifest.committer.cleanup.parallel.delete.base.first` | Attempt to delete the base directory before parallel task attempts | `false` | | `mapreduce.fileoutputcommitter.cleanup.skipped` | Skip cleanup of `_temporary` directory| `false` | | `mapreduce.fileoutputcommitter.cleanup-failures.ignored` | Ignore errors during cleanup | `false` | | `mapreduce.fileoutputcommitter.marksuccessfuljobs` | Create a `_SUCCESS` marker file on successful completion. (and delete any existing one in job setup) | `true` | @@ -238,37 +250,6 @@ Caveats are made against the store. The rate throttling option `mapreduce.manifest.committer.io.rate` can help avoid this. - -### `mapreduce.manifest.committer.writer.queue.capacity` - -This is a secondary scale option. -It controls the size of the queue for storing lists of files to rename from -the manifests loaded from the target filesystem, manifests loaded -from a pool of worker threads, and the single thread which saves -the entries from each manifest to an intermediate file in the local filesystem. - -Once the queue is full, all manifest loading threads will block. - -```xml - - mapreduce.manifest.committer.writer.queue.capacity - 32 - -``` - -As the local filesystem is usually much faster to write to than any cloud store, -this queue size should not be a limit on manifest load performance. - -It can help limit the amount of memory consumed during manifest load during -job commit. -The maximum number of loaded manifests will be: - -``` -mapreduce.manifest.committer.writer.queue.capacity + mapreduce.manifest.committer.io.threads -``` - - - ## Optional: deleting target files in Job Commit The classic `FileOutputCommitter` deletes files at the destination paths @@ -403,6 +384,153 @@ hadoop org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestP This works for the files saved at the base of an output directory, and any reports saved to a report directory. +Example from a run of the `ITestAbfsTerasort` MapReduce terasort. 
+ +``` +bin/mapred successfile abfs://testing@ukwest.dfs.core.windows.net/terasort/_SUCCESS + +Manifest file: abfs://testing@ukwest.dfs.core.windows.net/terasort/_SUCCESS +succeeded: true +created: 2024-04-18T18:34:34.003+01:00[Europe/London] +committer: org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitter +hostname: pi5 +jobId: job_1713461587013_0003 +jobIdSource: JobID +Diagnostics + mapreduce.manifest.committer.io.threads = 192 + principal = alice + stage = committer_commit_job + +Statistics: +counters=((commit_file_rename=1) +(committer_bytes_committed=21) +(committer_commit_job=1) +(committer_files_committed=1) +(committer_task_directory_depth=2) +(committer_task_file_count=2) +(committer_task_file_size=21) +(committer_task_manifest_file_size=37157) +(job_stage_cleanup=1) +(job_stage_create_target_dirs=1) +(job_stage_load_manifests=1) +(job_stage_optional_validate_output=1) +(job_stage_rename_files=1) +(job_stage_save_success_marker=1) +(job_stage_setup=1) +(op_create_directories=1) +(op_delete=3) +(op_delete_dir=1) +(op_get_file_status=9) +(op_get_file_status.failures=6) +(op_list_status=3) +(op_load_all_manifests=1) +(op_load_manifest=2) +(op_mkdirs=4) +(op_msync=1) +(op_rename=2) +(op_rename.failures=1) +(task_stage_commit=2) +(task_stage_save_task_manifest=1) +(task_stage_scan_directory=2) +(task_stage_setup=2)); + +gauges=(); + +minimums=((commit_file_rename.min=141) +(committer_commit_job.min=2306) +(committer_task_directory_count=0) +(committer_task_directory_depth=1) +(committer_task_file_count=0) +(committer_task_file_size=0) +(committer_task_manifest_file_size=18402) +(job_stage_cleanup.min=196) +(job_stage_create_target_dirs.min=2) +(job_stage_load_manifests.min=687) +(job_stage_optional_validate_output.min=66) +(job_stage_rename_files.min=161) +(job_stage_save_success_marker.min=653) +(job_stage_setup.min=571) +(op_create_directories.min=1) +(op_delete.min=57) +(op_delete_dir.min=129) +(op_get_file_status.failures.min=57) +(op_get_file_status.min=55) +(op_list_status.min=202) +(op_load_all_manifests.min=445) +(op_load_manifest.min=171) +(op_mkdirs.min=67) +(op_msync.min=0) +(op_rename.failures.min=266) +(op_rename.min=139) +(task_stage_commit.min=206) +(task_stage_save_task_manifest.min=651) +(task_stage_scan_directory.min=206) +(task_stage_setup.min=127)); + +maximums=((commit_file_rename.max=141) +(committer_commit_job.max=2306) +(committer_task_directory_count=0) +(committer_task_directory_depth=1) +(committer_task_file_count=1) +(committer_task_file_size=21) +(committer_task_manifest_file_size=18755) +(job_stage_cleanup.max=196) +(job_stage_create_target_dirs.max=2) +(job_stage_load_manifests.max=687) +(job_stage_optional_validate_output.max=66) +(job_stage_rename_files.max=161) +(job_stage_save_success_marker.max=653) +(job_stage_setup.max=571) +(op_create_directories.max=1) +(op_delete.max=113) +(op_delete_dir.max=129) +(op_get_file_status.failures.max=231) +(op_get_file_status.max=61) +(op_list_status.max=300) +(op_load_all_manifests.max=445) +(op_load_manifest.max=436) +(op_mkdirs.max=123) +(op_msync.max=0) +(op_rename.failures.max=266) +(op_rename.max=139) +(task_stage_commit.max=302) +(task_stage_save_task_manifest.max=651) +(task_stage_scan_directory.max=302) +(task_stage_setup.max=157)); + +means=((commit_file_rename.mean=(samples=1, sum=141, mean=141.0000)) +(committer_commit_job.mean=(samples=1, sum=2306, mean=2306.0000)) +(committer_task_directory_count=(samples=4, sum=0, mean=0.0000)) +(committer_task_directory_depth=(samples=2, 
sum=2, mean=1.0000)) +(committer_task_file_count=(samples=4, sum=2, mean=0.5000)) +(committer_task_file_size=(samples=2, sum=21, mean=10.5000)) +(committer_task_manifest_file_size=(samples=2, sum=37157, mean=18578.5000)) +(job_stage_cleanup.mean=(samples=1, sum=196, mean=196.0000)) +(job_stage_create_target_dirs.mean=(samples=1, sum=2, mean=2.0000)) +(job_stage_load_manifests.mean=(samples=1, sum=687, mean=687.0000)) +(job_stage_optional_validate_output.mean=(samples=1, sum=66, mean=66.0000)) +(job_stage_rename_files.mean=(samples=1, sum=161, mean=161.0000)) +(job_stage_save_success_marker.mean=(samples=1, sum=653, mean=653.0000)) +(job_stage_setup.mean=(samples=1, sum=571, mean=571.0000)) +(op_create_directories.mean=(samples=1, sum=1, mean=1.0000)) +(op_delete.mean=(samples=3, sum=240, mean=80.0000)) +(op_delete_dir.mean=(samples=1, sum=129, mean=129.0000)) +(op_get_file_status.failures.mean=(samples=6, sum=614, mean=102.3333)) +(op_get_file_status.mean=(samples=3, sum=175, mean=58.3333)) +(op_list_status.mean=(samples=3, sum=671, mean=223.6667)) +(op_load_all_manifests.mean=(samples=1, sum=445, mean=445.0000)) +(op_load_manifest.mean=(samples=2, sum=607, mean=303.5000)) +(op_mkdirs.mean=(samples=4, sum=361, mean=90.2500)) +(op_msync.mean=(samples=1, sum=0, mean=0.0000)) +(op_rename.failures.mean=(samples=1, sum=266, mean=266.0000)) +(op_rename.mean=(samples=1, sum=139, mean=139.0000)) +(task_stage_commit.mean=(samples=2, sum=508, mean=254.0000)) +(task_stage_save_task_manifest.mean=(samples=1, sum=651, mean=651.0000)) +(task_stage_scan_directory.mean=(samples=2, sum=508, mean=254.0000)) +(task_stage_setup.mean=(samples=2, sum=284, mean=142.0000))); + +``` + ## Collecting Job Summaries `mapreduce.manifest.committer.summary.report.directory` The committer can be configured to save the `_SUCCESS` summary files to a report directory, @@ -431,46 +559,62 @@ This allows for the statistics of jobs to be collected irrespective of their out saving the `_SUCCESS` marker is enabled, and without problems caused by a chain of queries overwriting the markers. +The `mapred successfile` operation can be used to print these reports. # Cleanup Job cleanup is convoluted as it is designed to address a number of issues which may surface in cloud storage. -* Slow performance for deletion of directories. -* Timeout when deleting very deep and wide directory trees. +* Slow performance for deletion of directories (GCS). +* Timeout when deleting very deep and wide directory trees (Azure). * General resilience to cleanup issues escalating to job failures. 
-| Option | Meaning | Default Value | -|--------|---------|---------------| -| `mapreduce.fileoutputcommitter.cleanup.skipped` | Skip cleanup of `_temporary` directory| `false` | -| `mapreduce.fileoutputcommitter.cleanup-failures.ignored` | Ignore errors during cleanup | `false` | -| `mapreduce.manifest.committer.cleanup.parallel.delete` | Delete task attempt directories in parallel | `true` | +| Option | Meaning | Default Value | +|-------------------------------------------------------------------|--------------------------------------------------------------------|---------------| +| `mapreduce.fileoutputcommitter.cleanup.skipped` | Skip cleanup of `_temporary` directory | `false` | +| `mapreduce.fileoutputcommitter.cleanup-failures.ignored` | Ignore errors during cleanup | `false` | +| `mapreduce.manifest.committer.cleanup.parallel.delete` | Delete task attempt directories in parallel | `true` | +| `mapreduce.manifest.committer.cleanup.parallel.delete.base.first` | Attempt to delete the base directory before parallel task attempts | `false` | The algorithm is: -``` -if `mapreduce.fileoutputcommitter.cleanup.skipped`: +```python +if "mapreduce.fileoutputcommitter.cleanup.skipped": return -if `mapreduce.manifest.committer.cleanup.parallel.delete`: - attempt parallel delete of task directories; catch any exception -if not `mapreduce.fileoutputcommitter.cleanup.skipped`: - delete(`_temporary`); catch any exception -if caught-exception and not `mapreduce.fileoutputcommitter.cleanup-failures.ignored`: - throw caught-exception +if "mapreduce.manifest.committer.cleanup.parallel.delete": + if "mapreduce.manifest.committer.cleanup.parallel.delete.base.first" : + if delete("_temporary"): + return + delete(list("$task-directories")) catch any exception +if not "mapreduce.fileoutputcommitter.cleanup.skipped": + delete("_temporary"); catch any exception +if caught-exception and not "mapreduce.fileoutputcommitter.cleanup-failures.ignored": + raise caught-exception ``` It's a bit complicated, but the goal is to perform a fast/scalable delete and throw a meaningful exception if that didn't work. -When working with ABFS and GCS, these settings should normally be left alone. -If somehow errors surface during cleanup, enabling the option to -ignore failures will ensure the job still completes. +For ABFS set `mapreduce.manifest.committer.cleanup.parallel.delete.base.first` to `true` +which should normally result in less network IO and a faster cleanup. + +``` +spark.hadoop.mapreduce.manifest.committer.cleanup.parallel.delete.base.first true +``` + +For GCS, setting `mapreduce.manifest.committer.cleanup.parallel.delete.base.first` +to `false` may speed up cleanup. + +If somehow errors surface during cleanup, ignoring failures will ensure the job +is still considered a success. +`mapreduce.fileoutputcommitter.cleanup-failures.ignored = true` + Disabling cleanup even avoids the overhead of cleanup, but requires a workflow or manual operation to clean up all -`_temporary` directories on a regular basis. - +`_temporary` directories on a regular basis: +`mapreduce.fileoutputcommitter.cleanup.skipped = true`. 
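To make the recommendations above concrete, here is a hedged sketch of setting the cleanup options programmatically through a Hadoop `Configuration`, equivalent to the `spark.hadoop.*` lines shown earlier. The property names are those documented in the table above; everything else is illustrative.

```java
import org.apache.hadoop.conf.Configuration;

public class ManifestCleanupOptions {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Parallel deletion of task attempt directories (the default).
    conf.setBoolean("mapreduce.manifest.committer.cleanup.parallel.delete", true);

    // ABFS: try one delete of the base _temporary directory first and only
    // fall back to the parallel task-attempt deletes if that fails.
    conf.setBoolean("mapreduce.manifest.committer.cleanup.parallel.delete.base.first", true);

    // Downgrade cleanup failures so they do not fail an otherwise successful job.
    conf.setBoolean("mapreduce.fileoutputcommitter.cleanup-failures.ignored", true);

    // Alternatively, skip cleanup entirely and purge _temporary directories
    // with an external workflow:
    // conf.setBoolean("mapreduce.fileoutputcommitter.cleanup.skipped", true);
  }
}
```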
# Working with Azure ADLS Gen2 Storage @@ -504,9 +648,15 @@ The core set of Azure-optimized options becomes - spark.hadoop.fs.azure.io.rate.limit - 10000 + fs.azure.io.rate.limit + 1000 + + + + mapreduce.manifest.committer.cleanup.parallel.delete.base.first + true + ``` And optional settings for debugging/performance analysis @@ -514,7 +664,7 @@ And optional settings for debugging/performance analysis ```xml mapreduce.manifest.committer.summary.report.directory - abfs:// Path within same store/separate store + Path within same store/separate store Optional: path to where job summaries are saved ``` @@ -523,14 +673,15 @@ And optional settings for debugging/performance analysis ``` spark.hadoop.mapreduce.outputcommitter.factory.scheme.abfs org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory -spark.hadoop.fs.azure.io.rate.limit 10000 +spark.hadoop.fs.azure.io.rate.limit 1000 +spark.hadoop.mapreduce.manifest.committer.cleanup.parallel.delete.base.first true spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol spark.hadoop.mapreduce.manifest.committer.summary.report.directory (optional: URI of a directory for job summaries) ``` -## Experimental: ABFS Rename Rate Limiting `fs.azure.io.rate.limit` +## ABFS Rename Rate Limiting `fs.azure.io.rate.limit` To avoid triggering store throttling and backoff delays, as well as other throttling-related failure conditions file renames during job commit @@ -544,13 +695,12 @@ may issue. Set the option to `0` remove all rate limiting. -The default value of this is set to 10000, which is the default IO capacity for -an ADLS storage account. +The default value of this is set to 1000. ```xml fs.azure.io.rate.limit - 10000 + 1000 maximum number of renames attempted per second ``` @@ -569,7 +719,7 @@ If server-side throttling took place, signs of this can be seen in * The store service's logs and their throttling status codes (usually 503 or 500). * The job statistic `commit_file_rename_recovered`. This statistic indicates that ADLS throttling manifested as failures in renames, failures which were recovered - from in the comitter. + from in the committer. If these are seen -or other applications running at the same time experience throttling/throttling-triggered problems, consider reducing the value of @@ -598,13 +748,14 @@ The Spark settings to switch to this committer are spark.hadoop.mapreduce.outputcommitter.factory.scheme.gs org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol - +spark.hadoop.mapreduce.manifest.committer.cleanup.parallel.delete.base.first false spark.hadoop.mapreduce.manifest.committer.summary.report.directory (optional: URI of a directory for job summaries) ``` The store's directory delete operations are `O(files)` so the value of `mapreduce.manifest.committer.cleanup.parallel.delete` -SHOULD be left at the default of `true`. 
+SHOULD be left at the default of `true`, but +`mapreduce.manifest.committer.cleanup.parallel.delete.base.first` changed to `false` For mapreduce, declare the binding in `core-site.xml`or `mapred-site.xml` ```xml @@ -639,19 +790,33 @@ spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOut spark.hadoop.mapreduce.manifest.committer.summary.report.directory (optional: URI of a directory for job summaries) ``` -# Advanced Topics - -## Advanced Configuration options +# Advanced Configuration options There are some advanced options which are intended for development and testing, rather than production use. -| Option | Meaning | Default Value | -|--------|----------------------------------------------|---------------| -| `mapreduce.manifest.committer.store.operations.classname` | Classname for Manifest Store Operations | `""` | -| `mapreduce.manifest.committer.validate.output` | Perform output validation? | `false` | -| `mapreduce.manifest.committer.writer.queue.capacity` | Queue capacity for writing intermediate file | `32` | +| Option | Meaning | Default Value | +|-----------------------------------------------------------|-------------------------------------------------------------|---------------| +| `mapreduce.manifest.committer.manifest.save.attempts` | How many attempts should be made to commit a task manifest? | `5` | +| `mapreduce.manifest.committer.store.operations.classname` | Classname for Manifest Store Operations | `""` | +| `mapreduce.manifest.committer.validate.output` | Perform output validation? | `false` | +| `mapreduce.manifest.committer.writer.queue.capacity` | Queue capacity for writing intermediate file | `32` | + +### `mapreduce.manifest.committer.manifest.save.attempts` + +The number of attempts which should be made to save a task attempt manifest, which is done by +1. Writing the file to a temporary file in the job attempt directory. +2. Deleting any existing task manifest +3. Renaming the temporary file to the final filename. +This may fail for unrecoverable reasons (permissions, permanent loss of network, service down,...) or it may be +a transient problem which may not reoccur if another attempt is made to write the data. + +The number of attempts to make is set by `mapreduce.manifest.committer.manifest.save.attempts`; +the sleep time increases with each attempt. + +Consider increasing the default value if task attempts fail to commit their work +and fail to recover from network problems. ### Validating output `mapreduce.manifest.committer.validate.output` @@ -691,6 +856,34 @@ There is no need to alter these values, except when writing new implementations something which is only needed if the store provides extra integration support for the committer. +### `mapreduce.manifest.committer.writer.queue.capacity` + +This is a secondary scale option. +It controls the size of the queue for storing lists of files to rename from +the manifests loaded from the target filesystem, manifests loaded +from a pool of worker threads, and the single thread which saves +the entries from each manifest to an intermediate file in the local filesystem. + +Once the queue is full, all manifest loading threads will block. + +```xml + + mapreduce.manifest.committer.writer.queue.capacity + 32 + +``` + +As the local filesystem is usually much faster to write to than any cloud store, +this queue size should not be a limit on manifest load performance. + +It can help limit the amount of memory consumed during manifest load during +job commit. 
+The maximum number of loaded manifests will be: + +``` +mapreduce.manifest.committer.writer.queue.capacity + mapreduce.manifest.committer.io.threads +``` + ## Support for concurrent jobs to the same directory It *may* be possible to run multiple jobs targeting the same directory tree. diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_architecture.md b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_architecture.md index 55806fb6f5b45..a1d8cb5fc3da8 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_architecture.md +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer_architecture.md @@ -19,6 +19,7 @@ This document describes the architecture and other implementation/correctness aspects of the [Manifest Committer](manifest_committer.html) The protocol and its correctness are covered in [Manifest Committer Protocol](manifest_committer_protocol.html). + The _Manifest_ committer is a committer for work which provides performance on ABFS for "real world" @@ -278,6 +279,11 @@ The manifest committer assumes that the amount of data being stored in memory is because there is no longer the need to store an etag for every block of every file being committed. +This assumption turned out not to hold for some jobs: +[MAPREDUCE-7435. ManifestCommitter OOM on azure job](https://issues.apache.org/jira/browse/MAPREDUCE-7435) + +The strategy here was to read in all manifests and stream their entries to a local file, as Hadoop +Writable objects -hence with lower marshalling overhead than JSON. #### Duplicate creation of directories in the dest dir diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/AbstractManifestCommitterTest.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/AbstractManifestCommitterTest.java index 5b64d544bc551..57c0c39ed9b7f 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/AbstractManifestCommitterTest.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/AbstractManifestCommitterTest.java @@ -57,6 +57,7 @@ import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations; +import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.UnreliableManifestStoreOperations; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SaveTaskManifestStage; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage; @@ -167,6 +168,12 @@ public abstract class AbstractManifestCommitterTest private static final int MAX_LEN = 64_000; + /** + * How many attempts to save manifests before giving up. 
+ * Kept small to reduce sleep times and network delays. + */ + public static final int SAVE_ATTEMPTS = 4; + /** * Submitter for tasks; may be null. */ @@ -771,6 +778,9 @@ protected StageConfig createStageConfigForJob( /** * Create the stage config for job or task but don't finalize it. * Uses {@link #TASK_IDS} for job/task ID. + * The store operations is extracted from + * {@link #getStoreOperations()}, which is how fault injection + * can be set up. * @param jobAttemptNumber job attempt number * @param taskIndex task attempt index; -1 for job attempt only. * @param taskAttemptNumber task attempt number @@ -796,6 +806,7 @@ protected StageConfig createStageConfig( .withJobAttemptNumber(jobAttemptNumber) .withJobDirectories(attemptDirs) .withName(String.format(NAME_FORMAT_JOB_ATTEMPT, jobId)) + .withManifestSaveAttempts(SAVE_ATTEMPTS) .withOperations(getStoreOperations()) .withProgressable(getProgressCounter()) .withSuccessMarkerFileLimit(100_000) @@ -924,7 +935,7 @@ protected TaskManifest executeOneTaskAttempt(final int task, } // save the manifest for this stage. - new SaveTaskManifestStage(taskStageConfig).apply(manifest); + new SaveTaskManifestStage(taskStageConfig).apply(() -> manifest); return manifest; } @@ -998,7 +1009,9 @@ protected void assertCleanupResult( * Create and execute a cleanup stage. * @param enabled is the stage enabled? * @param deleteTaskAttemptDirsInParallel delete task attempt dirs in - * parallel? + * parallel? + * @param attemptBaseDeleteFirst Make an initial attempt to + * delete the base directory * @param suppressExceptions suppress exceptions? * @param outcome expected outcome. * @param expectedDirsDeleted #of directories deleted. -1 for no checks @@ -1008,13 +1021,18 @@ protected void assertCleanupResult( protected CleanupJobStage.Result cleanup( final boolean enabled, final boolean deleteTaskAttemptDirsInParallel, + boolean attemptBaseDeleteFirst, final boolean suppressExceptions, final CleanupJobStage.Outcome outcome, final int expectedDirsDeleted) throws IOException { StageConfig stageConfig = getJobStageConfig(); CleanupJobStage.Result result = new CleanupJobStage(stageConfig) .apply(new CleanupJobStage.Arguments(OP_STAGE_JOB_CLEANUP, - enabled, deleteTaskAttemptDirsInParallel, suppressExceptions)); + enabled, + deleteTaskAttemptDirsInParallel, + attemptBaseDeleteFirst, + suppressExceptions, + 0)); assertCleanupResult(result, outcome, expectedDirsDeleted); return result; } @@ -1038,6 +1056,24 @@ protected String readText(final Path path) throws IOException { StandardCharsets.UTF_8); } + /** + * Make the store operations unreliable. + * If it already was then reset the failure options. + * @return the store operations + */ + protected UnreliableManifestStoreOperations makeStoreOperationsUnreliable() { + UnreliableManifestStoreOperations failures; + final ManifestStoreOperations wrappedOperations = getStoreOperations(); + if (wrappedOperations instanceof UnreliableManifestStoreOperations) { + failures = (UnreliableManifestStoreOperations) wrappedOperations; + failures.reset(); + } else { + failures = new UnreliableManifestStoreOperations(wrappedOperations); + setStoreOperations(failures); + } + return failures; + } + /** * Counter. 
*/ diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterTestSupport.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterTestSupport.java index 3b52fe9875641..31620e55239ae 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterTestSupport.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/ManifestCommitterTestSupport.java @@ -38,6 +38,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.statistics.IOStatistics; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.mapreduce.JobID; import org.apache.hadoop.mapreduce.RecordWriter; @@ -314,6 +315,21 @@ static void assertDirEntryMatch( .isEqualTo(type); } + /** + * Assert that none of the named statistics have any failure counts, + * which may be from being null or 0. + * @param iostats statistics + * @param names base name of the statistics (i.e. without ".failures" suffix) + */ + public static void assertNoFailureStatistics(IOStatistics iostats, String... names) { + final Map counters = iostats.counters(); + for (String name : names) { + Assertions.assertThat(counters.get(name + ".failures")) + .describedAs("Failure count of %s", name) + .matches(f -> f == null || f == 0); + } + } + /** * Save a manifest to an entry file; returning the loaded manifest data. * Caller MUST clean up the temp file. diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCleanupStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCleanupStage.java index 8d551c505209c..c8c766a43cff3 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCleanupStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCleanupStage.java @@ -80,17 +80,25 @@ public void setup() throws Exception { @Test public void testCleanupInParallelHealthy() throws Throwable { describe("parallel cleanup of TA dirs."); - cleanup(true, true, false, + cleanup(true, true, false, false, CleanupJobStage.Outcome.PARALLEL_DELETE, PARALLEL_DELETE_COUNT); verifyJobDirsCleanedUp(); } + @Test + public void testCleanupInParallelHealthyBaseFirst() throws Throwable { + describe("parallel cleanup of TA dirs with base first: one operation"); + cleanup(true, true, true, false, + CleanupJobStage.Outcome.DELETED, ROOT_DELETE_COUNT); + verifyJobDirsCleanedUp(); + } + @Test public void testCleanupSingletonHealthy() throws Throwable { describe("Cleanup with a single delete. 
Not the default; would be best on HDFS"); - cleanup(true, false, false, + cleanup(true, false, false, false, CleanupJobStage.Outcome.DELETED, ROOT_DELETE_COUNT); verifyJobDirsCleanedUp(); } @@ -99,31 +107,69 @@ public void testCleanupSingletonHealthy() throws Throwable { public void testCleanupNoDir() throws Throwable { describe("parallel cleanup MUST not fail if there's no dir"); // first do the cleanup - cleanup(true, true, false, + cleanup(true, true, false, false, CleanupJobStage.Outcome.PARALLEL_DELETE, PARALLEL_DELETE_COUNT); // now expect cleanup by single delete still works // the delete count is 0 as pre check skips it - cleanup(true, false, false, + cleanup(true, false, false, false, + CleanupJobStage.Outcome.NOTHING_TO_CLEAN_UP, 0); + cleanup(true, true, true, false, CleanupJobStage.Outcome.NOTHING_TO_CLEAN_UP, 0); // if skipped, that happens first - cleanup(false, true, false, + cleanup(false, true, false, false, CleanupJobStage.Outcome.DISABLED, 0); } @Test public void testFailureInParallelDelete() throws Throwable { - describe("A parallel delete fails, but the base delete works"); + describe("A parallel delete fails, but the fallback base delete works"); // pick one of the manifests TaskManifest manifest = manifests.get(4); - Path taPath = new Path(manifest.getTaskAttemptDir()); - failures.addDeletePathToFail(taPath); - cleanup(true, true, false, + failures.addDeletePathToFail(new Path(manifest.getTaskAttemptDir())); + cleanup(true, true, false, false, CleanupJobStage.Outcome.DELETED, PARALLEL_DELETE_COUNT); } + @Test + public void testFailureInParallelBaseDelete() throws Throwable { + describe("A parallel delete fails in the base delete; the parallel stage works"); + + // base path will timeout on first delete; the parallel delete will take place + failures.addDeletePathToTimeOut(getJobStageConfig().getOutputTempSubDir()); + failures.setFailureLimit(1); + cleanup(true, true, false, false, + CleanupJobStage.Outcome.PARALLEL_DELETE, PARALLEL_DELETE_COUNT); + } + + @Test + public void testDoubleFailureInParallelBaseDelete() throws Throwable { + describe("A parallel delete fails with the base delete and a task attempt dir"); + + // base path will timeout on first delete; the parallel delete will take place + failures.addDeletePathToTimeOut(getJobStageConfig().getOutputTempSubDir()); + TaskManifest manifest = manifests.get(4); + failures.addDeletePathToFail(new Path(manifest.getTaskAttemptDir())); + failures.setFailureLimit(2); + cleanup(true, true, true, false, + CleanupJobStage.Outcome.DELETED, PARALLEL_DELETE_COUNT + 1); + } + + @Test + public void testTripleFailureInParallelBaseDelete() throws Throwable { + describe("All delete phases will fail"); + + // base path will timeout on first delete; the parallel delete will take place + failures.addDeletePathToTimeOut(getJobStageConfig().getOutputTempSubDir()); + TaskManifest manifest = manifests.get(4); + failures.addDeletePathToFail(new Path(manifest.getTaskAttemptDir())); + failures.setFailureLimit(4); + cleanup(true, true, true, true, + CleanupJobStage.Outcome.FAILURE, PARALLEL_DELETE_COUNT + 1); + } + /** * If there's no job task attempt subdir then the list of it will raise * and FNFE; this MUST be caught and the base delete executed. @@ -135,7 +181,7 @@ public void testParallelDeleteNoTaskAttemptDir() throws Throwable { StageConfig stageConfig = getJobStageConfig(); // TA dir doesn't exist, so listing will fail. 
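    // (the addPathNotFound() fault injection below is what simulates the missing
    // directory; the stage must catch the resulting FileNotFoundException and
    // fall back to the single base-directory delete, hence ROOT_DELETE_COUNT.)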
failures.addPathNotFound(stageConfig.getJobAttemptTaskSubDir()); - cleanup(true, true, false, + cleanup(true, true, false, false, CleanupJobStage.Outcome.DELETED, ROOT_DELETE_COUNT); } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCommitTaskStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCommitTaskStage.java index 4f4162d46cb9f..95de9a32eecd1 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCommitTaskStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCommitTaskStage.java @@ -19,13 +19,21 @@ package org.apache.hadoop.mapreduce.lib.output.committer.manifest; import java.io.FileNotFoundException; +import java.net.SocketTimeoutException; import org.assertj.core.api.Assertions; import org.junit.Test; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.contract.ContractTestUtils; +import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot; +import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.ManifestSuccessData; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest; +import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations; +import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.UnreliableManifestStoreOperations; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CleanupJobStage; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitJobStage; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.CommitTaskStage; @@ -33,14 +41,27 @@ import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.SetupTaskStage; import org.apache.hadoop.mapreduce.lib.output.committer.manifest.stages.StageConfig; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticCounter; +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_SAVE_TASK_MANIFEST; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_JOB_CLEANUP; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestPathForTask; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestTempPathForTaskAttempt; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.UnreliableManifestStoreOperations.E_TIMEOUT; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.UnreliableManifestStoreOperations.generatedErrorMessage; import static org.apache.hadoop.test.LambdaTestUtils.intercept; /** - * Test committing a task. + * Test committing a task, with lots of fault injection to validate + * resilience to transient failures. 
*/ public class TestCommitTaskStage extends AbstractManifestCommitterTest { + public static final String TASK1 = String.format("task_%03d", 1); + + public static final String TASK1_ATTEMPT1 = String.format("%s_%02d", + TASK1, 1); + @Override public void setup() throws Exception { super.setup(); @@ -51,6 +72,15 @@ public void setup() throws Exception { new SetupJobStage(stageConfig).apply(true); } + + /** + * Create a stage config for job 1 task1 attempt 1. + * @return a task stage configuration. + */ + private StageConfig createStageConfig() { + return createTaskStageConfig(JOB1, TASK1, TASK1_ATTEMPT1); + } + @Test public void testCommitMissingDirectory() throws Throwable { @@ -108,8 +138,9 @@ public void testCommitEmptyDirectory() throws Throwable { OP_STAGE_JOB_CLEANUP, true, true, - false - ))); + false, + false, + 0))); // review success file final Path successPath = outcome.getSuccessPath(); @@ -123,4 +154,283 @@ public void testCommitEmptyDirectory() throws Throwable { .isEmpty(); } + + @Test + public void testManifestSaveFailures() throws Throwable { + describe("Test recovery of manifest save/rename failures"); + + UnreliableManifestStoreOperations failures = makeStoreOperationsUnreliable(); + + StageConfig stageConfig = createStageConfig(); + + new SetupTaskStage(stageConfig).apply("setup"); + + final Path manifestDir = stageConfig.getTaskManifestDir(); + // final manifest file is by task ID + Path manifestFile = manifestPathForTask(manifestDir, + stageConfig.getTaskId()); + Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir, + stageConfig.getTaskAttemptId()); + + // manifest save will fail but recover before the task gives up. + failures.addSaveToFail(manifestTempFile); + + // will fail because too many attempts failed. + failures.setFailureLimit(SAVE_ATTEMPTS + 1); + intercept(PathIOException.class, generatedErrorMessage("save"), () -> + new CommitTaskStage(stageConfig).apply(null)); + + // will succeed because the failure limit is set lower + failures.setFailureLimit(SAVE_ATTEMPTS - 1); + new CommitTaskStage(stageConfig).apply(null); + + describe("Testing timeouts on rename operations."); + // now do it for the renames, which will fail after the rename + failures.reset(); + failures.addTimeoutBeforeRename(manifestTempFile); + + // first verify that if too many attempts fail, the task will fail + failures.setFailureLimit(SAVE_ATTEMPTS + 1); + intercept(SocketTimeoutException.class, E_TIMEOUT, () -> + new CommitTaskStage(stageConfig).apply(null)); + + // reduce the limit and expect the stage to succeed. + failures.setFailureLimit(SAVE_ATTEMPTS - 1); + new CommitTaskStage(stageConfig).apply(null); + } + + /** + * Save with renaming failing before the rename; the source file + * will be present on the next attempt. + * The successfully saved manifest file is loaded and its statistics + * examined to verify that the failure count is updated. 
+ */ + @Test + public void testManifestRenameEarlyTimeouts() throws Throwable { + describe("Testing timeouts on rename operations."); + + UnreliableManifestStoreOperations failures = makeStoreOperationsUnreliable(); + StageConfig stageConfig = createStageConfig(); + + new SetupTaskStage(stageConfig).apply("setup"); + + final Path manifestDir = stageConfig.getTaskManifestDir(); + // final manifest file is by task ID + Path manifestFile = manifestPathForTask(manifestDir, + stageConfig.getTaskId()); + Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir, + stageConfig.getTaskAttemptId()); + + + // configure for which will fail after the rename + failures.addTimeoutBeforeRename(manifestTempFile); + + // first verify that if too many attempts fail, the task will fail + failures.setFailureLimit(SAVE_ATTEMPTS + 1); + intercept(SocketTimeoutException.class, E_TIMEOUT, () -> + new CommitTaskStage(stageConfig).apply(null)); + // and that the IO stats are updated + final IOStatisticsStore iostats = stageConfig.getIOStatistics(); + assertThatStatisticCounter(iostats, OP_SAVE_TASK_MANIFEST + ".failures") + .isEqualTo(SAVE_ATTEMPTS); + + // reduce the limit and expect the stage to succeed. + iostats.reset(); + failures.setFailureLimit(SAVE_ATTEMPTS); + final CommitTaskStage.Result r = new CommitTaskStage(stageConfig).apply(null); + + // load in the manifest + final TaskManifest loadedManifest = TaskManifest.load(getFileSystem(), r.getPath()); + final IOStatisticsSnapshot loadedIOStats = loadedManifest.getIOStatistics(); + LOG.info("Statistics of file successfully saved:\nD {}", + ioStatisticsToPrettyString(loadedIOStats)); + assertThatStatisticCounter(loadedIOStats, OP_SAVE_TASK_MANIFEST + ".failures") + .isEqualTo(SAVE_ATTEMPTS - 1); + } + + @Test + public void testManifestRenameLateTimeoutsFailure() throws Throwable { + describe("Testing timeouts on rename operations."); + + UnreliableManifestStoreOperations failures = makeStoreOperationsUnreliable(); + StageConfig stageConfig = createStageConfig(); + + new SetupTaskStage(stageConfig).apply("setup"); + + final Path manifestDir = stageConfig.getTaskManifestDir(); + + Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir, + stageConfig.getTaskAttemptId()); + + failures.addTimeoutAfterRename(manifestTempFile); + + // if too many attempts fail, the task will fail + failures.setFailureLimit(SAVE_ATTEMPTS + 1); + intercept(SocketTimeoutException.class, E_TIMEOUT, () -> + new CommitTaskStage(stageConfig).apply(null)); + + } + + @Test + public void testManifestRenameLateTimeoutsRecovery() throws Throwable { + describe("Testing recovery from late timeouts on rename operations."); + + UnreliableManifestStoreOperations failures = makeStoreOperationsUnreliable(); + StageConfig stageConfig = createStageConfig(); + + new SetupTaskStage(stageConfig).apply("setup"); + + final Path manifestDir = stageConfig.getTaskManifestDir(); + + Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir, + stageConfig.getTaskAttemptId()); + + failures.addTimeoutAfterRename(manifestTempFile); + + // reduce the limit and expect the stage to succeed. 
+ failures.setFailureLimit(SAVE_ATTEMPTS); + stageConfig.getIOStatistics().reset(); + new CommitTaskStage(stageConfig).apply(null); + final CommitTaskStage.Result r = new CommitTaskStage(stageConfig).apply(null); + + // load in the manifest + final TaskManifest loadedManifest = TaskManifest.load(getFileSystem(), r.getPath()); + final IOStatisticsSnapshot loadedIOStats = loadedManifest.getIOStatistics(); + LOG.info("Statistics of file successfully saved:\n{}", + ioStatisticsToPrettyString(loadedIOStats)); + // the failure event is one less than the limit. + assertThatStatisticCounter(loadedIOStats, OP_SAVE_TASK_MANIFEST + ".failures") + .isEqualTo(SAVE_ATTEMPTS - 1); + } + + @Test + public void testFailureToDeleteManifestPath() throws Throwable { + describe("Testing failure in the delete call made before renaming the manifest"); + + UnreliableManifestStoreOperations failures = makeStoreOperationsUnreliable(); + StageConfig stageConfig = createStageConfig(); + + new SetupTaskStage(stageConfig).apply("setup"); + + final Path manifestDir = stageConfig.getTaskManifestDir(); + // final manifest file is by task ID + Path manifestFile = manifestPathForTask(manifestDir, + stageConfig.getTaskId()); + // put a file in as there is a check for it before the delete + ContractTestUtils.touch(getFileSystem(), manifestFile); + /* and the delete shall fail */ + failures.addDeletePathToFail(manifestFile); + Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir, + stageConfig.getTaskAttemptId()); + + + // first verify that if too many attempts fail, the task will fail + failures.setFailureLimit(SAVE_ATTEMPTS + 1); + intercept(PathIOException.class, () -> + new CommitTaskStage(stageConfig).apply(null)); + + // reduce the limit and expect the stage to succeed. + failures.setFailureLimit(SAVE_ATTEMPTS - 1); + new CommitTaskStage(stageConfig).apply(null); + } + + + /** + * Failure of delete before saving the manifest to a temporary path. + */ + @Test + public void testFailureOfDeleteBeforeSavingTemporaryFile() throws Throwable { + describe("Testing failure in the delete call made before rename"); + + UnreliableManifestStoreOperations failures = makeStoreOperationsUnreliable(); + StageConfig stageConfig = createStageConfig(); + + new SetupTaskStage(stageConfig).apply("setup"); + + final Path manifestDir = stageConfig.getTaskManifestDir(); + Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir, + stageConfig.getTaskAttemptId()); + + // delete will fail + failures.addDeletePathToFail(manifestTempFile); + + // first verify that if too many attempts fail, the task will fail + failures.setFailureLimit(SAVE_ATTEMPTS + 1); + intercept(PathIOException.class, () -> + new CommitTaskStage(stageConfig).apply(null)); + + // reduce the limit and expect the stage to succeed. + failures.setFailureLimit(SAVE_ATTEMPTS - 1); + new CommitTaskStage(stageConfig).apply(null); + + } + /** + * Rename target is a directory. 
+ */ + @Test + public void testRenameTargetIsDir() throws Throwable { + describe("Rename target is a directory"); + + final ManifestStoreOperations operations = getStoreOperations(); + StageConfig stageConfig = createStageConfig(); + + final SetupTaskStage setup = new SetupTaskStage(stageConfig); + setup.apply("setup"); + + final Path manifestDir = stageConfig.getTaskManifestDir(); + // final manifest file is by task ID + Path manifestFile = manifestPathForTask(manifestDir, + stageConfig.getTaskId()); + Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir, + stageConfig.getTaskAttemptId()); + + // add a directory where the manifest file is to go + setup.mkdirs(manifestFile, true); + ContractTestUtils.assertIsDirectory(getFileSystem(), manifestFile); + new CommitTaskStage(stageConfig).apply(null); + + // this must be a file. + final FileStatus st = operations.getFileStatus(manifestFile); + Assertions.assertThat(st) + .describedAs("File status of %s", manifestFile) + .matches(FileStatus::isFile, "is a file"); + + // and it must load. + final TaskManifest manifest = setup.loadManifest(st); + Assertions.assertThat(manifest) + .matches(m -> m.getTaskID().equals(TASK1)) + .matches(m -> m.getTaskAttemptID().equals(TASK1_ATTEMPT1)); + } + + /** + * Manifest temp file path is a directory. + */ + @Test + public void testManifestTempFileIsDir() throws Throwable { + describe("Manifest temp file path is a directory"); + + final ManifestStoreOperations operations = getStoreOperations(); + StageConfig stageConfig = createStageConfig(); + + final SetupTaskStage setup = new SetupTaskStage(stageConfig); + setup.apply("setup"); + + final Path manifestDir = stageConfig.getTaskManifestDir(); + // final manifest file is by task ID + Path manifestFile = manifestPathForTask(manifestDir, + stageConfig.getTaskId()); + Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir, + stageConfig.getTaskAttemptId()); + + // add a directory where the manifest file is to go + setup.mkdirs(manifestTempFile, true); + new CommitTaskStage(stageConfig).apply(null); + + final TaskManifest manifest = setup.loadManifest( + operations.getFileStatus(manifestFile)); + Assertions.assertThat(manifest) + .matches(m -> m.getTaskID().equals(TASK1)) + .matches(m -> m.getTaskAttemptID().equals(TASK1_ATTEMPT1)); + } + } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCreateOutputDirectoriesStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCreateOutputDirectoriesStage.java index c471ef11a88d4..b2d3c3f84a6bd 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCreateOutputDirectoriesStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestCreateOutputDirectoriesStage.java @@ -247,7 +247,7 @@ public void testPrepareDirtyTree() throws Throwable { CreateOutputDirectoriesStage attempt2 = new CreateOutputDirectoriesStage( createStageConfigForJob(JOB1, destDir) - .withDeleteTargetPaths(true)); + .withDeleteTargetPaths(false)); // attempt will fail because one of the entries marked as // a file to delete is now a non-empty directory LOG.info("Executing failing attempt to 
create the directories"); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestJobThroughManifestCommitter.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestJobThroughManifestCommitter.java index 4bc2ce9bcf648..152b2c86e0f9c 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestJobThroughManifestCommitter.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestJobThroughManifestCommitter.java @@ -598,7 +598,8 @@ public void test_0450_validationDetectsFailures() throws Throwable { public void test_0900_cleanupJob() throws Throwable { describe("Cleanup job"); CleanupJobStage.Arguments arguments = new CleanupJobStage.Arguments( - OP_STAGE_JOB_CLEANUP, true, true, false); + OP_STAGE_JOB_CLEANUP, true, true, + false, false, 0); // the first run will list the three task attempt dirs and delete each // one before the toplevel dir. CleanupJobStage.Result result = new CleanupJobStage( @@ -615,7 +616,7 @@ public void test_0900_cleanupJob() throws Throwable { * Needed to clean up the shared test root, as test case teardown * does not do it. */ - //@Test + @Test public void test_9999_cleanupTestDir() throws Throwable { if (shouldDeleteTestRootAtEndOfTestRun()) { deleteSharedTestRoot(); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestLoadManifestsStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestLoadManifestsStage.java index 4dd7fe2dbcea5..ce20e02457a89 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestLoadManifestsStage.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/TestLoadManifestsStage.java @@ -176,7 +176,7 @@ public void testSaveThenLoadManyManifests() throws Throwable { // and skipping the rename stage (which is going to fail), // go straight to cleanup new CleanupJobStage(stageConfig).apply( - new CleanupJobStage.Arguments("", true, true, false)); + new CleanupJobStage.Arguments("", true, true, false, false, 0)); heapinfo(heapInfo, "cleanup"); ManifestSuccessData success = createManifestOutcome(stageConfig, OP_STAGE_JOB_COMMIT); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/UnreliableManifestStoreOperations.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/UnreliableManifestStoreOperations.java index 811fc704a2a33..61a6ce1421e38 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/UnreliableManifestStoreOperations.java +++ 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/UnreliableManifestStoreOperations.java @@ -21,8 +21,10 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InterruptedIOException; +import java.net.SocketTimeoutException; import java.util.HashSet; import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,8 +49,7 @@ * This is for testing. It could be implemented via * Mockito 2 spy code but is not so that: * 1. It can be backported to Hadoop versions using Mockito 1.x. - * 2. It can be extended to use in production. This is why it is in - * the production module -to allow for downstream tests to adopt it. + * 2. It can be extended to use in production. * 3. You can actually debug what's going on. */ @InterfaceAudience.Private @@ -69,6 +70,12 @@ public class UnreliableManifestStoreOperations extends ManifestStoreOperations { */ public static final String SIMULATED_FAILURE = "Simulated failure"; + /** + * Default failure limit. + * Set to a large enough value that most tests don't hit it. + */ + private static final int DEFAULT_FAILURE_LIMIT = Integer.MAX_VALUE; + /** * Underlying store operations to wrap. */ @@ -110,6 +117,16 @@ public class UnreliableManifestStoreOperations extends ManifestStoreOperations { */ private final Set renameDestDirsToFail = new HashSet<>(); + /** + * Source paths of rename operations to time out before the rename request is issued. + */ + private final Set renamePathsToTimeoutBeforeRename = new HashSet<>(); + + /** + * Source paths of rename operations to time out after the rename request has succeeded. + */ + private final Set renamePathsToTimeoutAfterRename = new HashSet<>(); + /** * Path of save() to fail. */ @@ -125,6 +142,11 @@ public class UnreliableManifestStoreOperations extends ManifestStoreOperations { */ private boolean renameToFailWithException = true; + /** + * How many failures before an operation is passed through. + */ + private final AtomicInteger failureLimit = new AtomicInteger(DEFAULT_FAILURE_LIMIT); + /** * Constructor. * @param wrappedOperations operations to wrap. @@ -133,16 +155,19 @@ public UnreliableManifestStoreOperations(final ManifestStoreOperations wrappedOp this.wrappedOperations = wrappedOperations; } - /** * Reset everything. */ public void reset() { deletePathsToFail.clear(); deletePathsToTimeOut.clear(); + failureLimit.set(DEFAULT_FAILURE_LIMIT); pathNotFound.clear(); renameSourceFilesToFail.clear(); renameDestDirsToFail.clear(); + renamePathsToTimeoutBeforeRename.clear(); + renamePathsToTimeoutAfterRename.clear(); + saveToFail.clear(); timeoutSleepTimeMillis = 0; } @@ -219,6 +244,21 @@ public void addRenameDestDirsFail(Path path) { renameDestDirsToFail.add(requireNonNull(path)); } + /** + * Add a source path to timeout before the rename. + * @param path path to add. + */ + public void addTimeoutBeforeRename(Path path) { + renamePathsToTimeoutBeforeRename.add(requireNonNull(path)); + } + /** + * Add a source path to timeout after the rename. + * @param path path to add. + */ + public void addTimeoutAfterRename(Path path) { + renamePathsToTimeoutAfterRename.add(requireNonNull(path)); + } + /** * Add a path to the list of paths where save will fail. * @param path path to add. @@ -228,7 +268,16 @@ public void addSaveToFail(Path path) { } /** - * Raise an exception if the path is in the set of target paths. 
+ * Set the failure limit. + * @param limit limit + */ + public void setFailureLimit(int limit) { + failureLimit.set(limit); + } + + /** + * Raise an exception if the path is in the set of target paths + * and the failure limit is not exceeded. * @param operation operation which failed. * @param path path to check * @param paths paths to probe for {@code path} being in. @@ -236,20 +285,56 @@ public void addSaveToFail(Path path) { */ private void maybeRaiseIOE(String operation, Path path, Set paths) throws IOException { + if (paths.contains(path) && decrementAndCheckFailureLimit()) { + // hand off to the inner check. + maybeRaiseIOENoFailureLimitCheck(operation, path, paths); + } + } + + /** + * Raise an exception if the path is in the set of target paths. + * No checks on failure count are performed. + * @param operation operation which failed. + * @param path path to check + * @param paths paths to probe for {@code path} being in. + * @throws IOException simulated failure + */ + private void maybeRaiseIOENoFailureLimitCheck(String operation, Path path, Set paths) + throws IOException { if (paths.contains(path)) { LOG.info("Simulating failure of {} with {}", operation, path); throw new PathIOException(path.toString(), - SIMULATED_FAILURE + " of " + operation); + generatedErrorMessage(operation)); } } + /** + * Given an operation, return the error message which is used for the simulated + * {@link PathIOException}. + * @param operation operation name + * @return error text + */ + public static String generatedErrorMessage(final String operation) { + return SIMULATED_FAILURE + " of " + operation; + } + + /** + * Check if the failure limit is exceeded. + * Call this after any other trigger checks, as it decrements the counter. + * + * @return true if the limit is not exceeded. + */ + private boolean decrementAndCheckFailureLimit() { + return failureLimit.decrementAndGet() > 0; + } + /** * Verify that a path is not on the file not found list. * @param path path * @throws FileNotFoundException if configured to fail. */ private void verifyExists(Path path) throws FileNotFoundException { - if (pathNotFound.contains(path)) { + if (pathNotFound.contains(path) && decrementAndCheckFailureLimit()) { throw new FileNotFoundException(path.toString()); } } @@ -260,11 +345,12 @@ private void verifyExists(Path path) throws FileNotFoundException { * @param operation operation which failed. * @param path path to check * @param paths paths to probe for {@code path} being in. - * @throws IOException simulated timeout + * @throws SocketTimeoutException simulated timeout + * @throws InterruptedIOException if the sleep is interrupted. 
*/ private void maybeTimeout(String operation, Path path, Set paths) - throws IOException { - if (paths.contains(path)) { + throws SocketTimeoutException, InterruptedIOException { + if (paths.contains(path) && decrementAndCheckFailureLimit()) { LOG.info("Simulating timeout of {} with {}", operation, path); try { if (timeoutSleepTimeMillis > 0) { @@ -273,14 +359,16 @@ private void maybeTimeout(String operation, Path path, Set paths) } catch (InterruptedException e) { throw new InterruptedIOException(e.toString()); } - throw new PathIOException(path.toString(), - "ErrorCode=" + OPERATION_TIMED_OUT + throw new SocketTimeoutException( + path.toString() + ": " + operation + + " ErrorCode=" + OPERATION_TIMED_OUT + " ErrorMessage=" + E_TIMEOUT); } } @Override public FileStatus getFileStatus(final Path path) throws IOException { + maybeTimeout("getFileStatus()", path, pathNotFound); verifyExists(path); return wrappedOperations.getFileStatus(path); } @@ -304,17 +392,23 @@ public boolean mkdirs(final Path path) throws IOException { public boolean renameFile(final Path source, final Path dest) throws IOException { String op = "rename"; + maybeTimeout(op, source, renamePathsToTimeoutBeforeRename); if (renameToFailWithException) { maybeRaiseIOE(op, source, renameSourceFilesToFail); maybeRaiseIOE(op, dest.getParent(), renameDestDirsToFail); } else { - if (renameSourceFilesToFail.contains(source) - || renameDestDirsToFail.contains(dest.getParent())) { + // logic to determine whether rename should just return false. + if ((renameSourceFilesToFail.contains(source) + || renameDestDirsToFail.contains(dest.getParent()) + && decrementAndCheckFailureLimit())) { LOG.info("Failing rename({}, {})", source, dest); return false; } } - return wrappedOperations.renameFile(source, dest); + final boolean b = wrappedOperations.renameFile(source, dest); + // post rename timeout. + maybeTimeout(op, source, renamePathsToTimeoutAfterRename); + return b; } @Override @@ -358,13 +452,19 @@ public boolean storeSupportsResilientCommit() { @Override public CommitFileResult commitFile(final FileEntry entry) throws IOException { + final String op = "commitFile"; + final Path source = entry.getSourcePath(); + maybeTimeout(op, source, renamePathsToTimeoutBeforeRename); if (renameToFailWithException) { - maybeRaiseIOE("commitFile", - entry.getSourcePath(), renameSourceFilesToFail); - maybeRaiseIOE("commitFile", + maybeRaiseIOE(op, + source, renameSourceFilesToFail); + maybeRaiseIOE(op, entry.getDestPath().getParent(), renameDestDirsToFail); } - return wrappedOperations.commitFile(entry); + final CommitFileResult result = wrappedOperations.commitFile(entry); + // post rename timeout. 
+ maybeTimeout(op, source, renamePathsToTimeoutAfterRename); + return result; } @Override diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/log4j.properties b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/log4j.properties index 81a3f6ad5d248..ba3ce740caf05 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/log4j.properties +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/log4j.properties @@ -17,3 +17,5 @@ log4j.threshold=ALL log4j.appender.stdout=org.apache.log4j.ConsoleAppender log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p [%t] %c{2} (%F:%M(%L)) - %m%n + +log4j.logger.org.apache.hadoop.mapreduce.lib.output.committer.manifest=DEBUG \ No newline at end of file diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index 9c812554c4d09..58576b4287992 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -111,7 +111,7 @@ 27.0-jre 4.2.3 - 1.77 + 1.78 2.0.0.AM26 @@ -135,7 +135,7 @@ 3.8.2 1.1.1 4.0.3 - 10.14.2.0 + 10.17.1.0 6.2.1.jre7 4.11.0 1.6.20 @@ -186,7 +186,7 @@ 1.3.1 1.0-beta-1 900 - 1.12.599 + 1.12.720 2.24.6 1.0.1 2.7.1 @@ -1959,6 +1959,11 @@ kerb-core ${kerby.version} + + org.apache.kerby + kerb-util + ${kerby.version} + org.ehcache ehcache diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/AWSHeaders.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/AWSHeaders.java index e0d6fa5aecc0b..aaca3b9b194d6 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/AWSHeaders.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/AWSHeaders.java @@ -55,6 +55,9 @@ public interface AWSHeaders { /** Header for optional server-side encryption algorithm. */ String SERVER_SIDE_ENCRYPTION = "x-amz-server-side-encryption"; + /** Header for optional server-side encryption algorithm. */ + String SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID = "x-amz-server-side-encryption-aws-kms-key-id"; + /** Range header for the get object request. */ String RANGE = "Range"; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/HeaderProcessing.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/HeaderProcessing.java index d42dda59caa5f..3865c391d6ddb 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/HeaderProcessing.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/HeaderProcessing.java @@ -47,6 +47,7 @@ import static org.apache.hadoop.fs.s3a.Statistic.INVOCATION_XATTR_GET_NAMED; import static org.apache.hadoop.fs.s3a.Statistic.INVOCATION_XATTR_GET_NAMED_MAP; import static org.apache.hadoop.fs.s3a.commit.CommitConstants.X_HEADER_MAGIC_MARKER; +import static org.apache.hadoop.fs.s3a.impl.AWSHeaders.SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID; import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration; /** @@ -185,6 +186,9 @@ public class HeaderProcessing extends AbstractStoreOperation { public static final String XA_SERVER_SIDE_ENCRYPTION = XA_HEADER_PREFIX + AWSHeaders.SERVER_SIDE_ENCRYPTION; + public static final String XA_ENCRYPTION_KEY_ID = + XA_HEADER_PREFIX + SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID; + /** * Storage Class XAttr: {@value}. 
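// Illustrative sketch (not part of the patch): with XA_ENCRYPTION_KEY_ID surfaced by
// HeaderProcessing, a client can read the KMS key id of an object through the normal
// FileSystem XAttr API, much as the EncryptionTestUtils helper added later in this
// patch does. "fs" and "path" are whatever S3AFileSystem/Path the caller already holds.
static String kmsKeyIdOf(S3AFileSystem fs, Path path) throws IOException {
  final Map<String, byte[]> xAttrs = fs.getXAttrs(path);
  final byte[] raw = xAttrs.get(HeaderProcessing.XA_ENCRYPTION_KEY_ID);
  // decodeBytes() turns the stored header bytes back into a string; null when the
  // object was not written with SSE-KMS.
  return raw == null ? null : HeaderProcessing.decodeBytes(raw);
}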
*/ @@ -363,6 +367,8 @@ private Map retrieveHeaders( md.versionId()); maybeSetHeader(headers, XA_SERVER_SIDE_ENCRYPTION, md.serverSideEncryptionAsString()); + maybeSetHeader(headers, XA_ENCRYPTION_KEY_ID, + md.ssekmsKeyId()); maybeSetHeader(headers, XA_STORAGE_CLASS, md.storageClassAsString()); diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md index 895815444932c..2dff5b799149a 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md @@ -720,7 +720,7 @@ such use case, The "MAGIC PATH" for each job is unique of the format `__magic_jo multiple job running simultaneously do not step into each other. Before attempting this, the committers must be set to not delete all incomplete uploads on job commit, -by setting `fs.s3a.committer.abort.pending.uploads` to `false`. This is set to `false`by default +by setting `fs.s3a.committer.abort.pending.uploads` to `false`. This is set to `true` by default. ```xml diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/EncryptionTestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/EncryptionTestUtils.java index 7b2b1c639e3cc..42c8de996bac0 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/EncryptionTestUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/EncryptionTestUtils.java @@ -19,7 +19,11 @@ package org.apache.hadoop.fs.s3a; import java.io.IOException; +import java.util.Map; +import java.util.Optional; +import org.apache.hadoop.fs.s3a.impl.HeaderProcessing; +import org.assertj.core.api.Assertions; import software.amazon.awssdk.services.s3.model.HeadObjectResponse; import org.apache.commons.codec.digest.DigestUtils; @@ -28,6 +32,8 @@ import org.apache.hadoop.fs.Path; import static org.apache.hadoop.fs.s3a.Constants.S3_ENCRYPTION_KEY; +import static org.apache.hadoop.fs.s3a.impl.HeaderProcessing.XA_ENCRYPTION_KEY_ID; +import static org.apache.hadoop.fs.s3a.impl.HeaderProcessing.XA_SERVER_SIDE_ENCRYPTION; import static org.assertj.core.api.Assertions.assertThat; public final class EncryptionTestUtils { @@ -111,4 +117,31 @@ public static void assertEncrypted(S3AFileSystem fs, } } + /** + * Assert that a path is encrypted with right encryption settings. + * @param fs filesystem. + * @param path path + * @param algorithm encryption algorithm. + * @param kmsKey full kms key if present. + * @throws IOException any IOE. 
+ */ + public static void validateEncryptionFileAttributes(S3AFileSystem fs, + Path path, + String algorithm, + Optional kmsKey) throws IOException { + Map xAttrs = fs.getXAttrs(path); + Assertions.assertThat(xAttrs.get(XA_SERVER_SIDE_ENCRYPTION)) + .describedAs("Server side encryption must not be null") + .isNotNull(); + Assertions.assertThat(HeaderProcessing.decodeBytes(xAttrs.get(XA_SERVER_SIDE_ENCRYPTION))) + .describedAs("Server side encryption algorithm must match") + .isEqualTo(algorithm); + Assertions.assertThat(xAttrs) + .describedAs("Encryption key id should be present") + .containsKey(XA_ENCRYPTION_KEY_ID); + kmsKey.ifPresent(s -> Assertions + .assertThat(HeaderProcessing.decodeBytes(xAttrs.get(XA_ENCRYPTION_KEY_ID))) + .describedAs("Encryption key id should match with the kms key") + .isEqualTo(s)); + } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java index 87a60c8c38556..e6f258e556417 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java @@ -44,6 +44,7 @@ import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_BINDING; import static org.apache.hadoop.fs.s3a.impl.InstantiationIOException.CONSTRUCTOR_EXCEPTION; import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData; +import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.isUsingDefaultExternalDataFile; import static org.apache.hadoop.test.LambdaTestUtils.intercept; import static org.junit.Assert.*; @@ -207,6 +208,11 @@ public void testBadCredentialsWithRemap() throws Exception { @Test public void testAnonymousProvider() throws Exception { Configuration conf = createConf(AnonymousAWSCredentialsProvider.class); + if (isUsingDefaultExternalDataFile(conf)) { + removeBaseAndBucketOverrides(conf, + ENDPOINT); + conf.set(ENDPOINT, CENTRAL_ENDPOINT); + } Path testFile = getExternalData(conf); try (FileSystem fs = FileSystem.newInstance(testFile.toUri(), conf)) { Assertions.assertThat(fs) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEKMSDefaultKey.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEKMSDefaultKey.java index 7e399f347100f..f35f15c1131ac 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEKMSDefaultKey.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEKMSDefaultKey.java @@ -19,12 +19,18 @@ package org.apache.hadoop.fs.s3a; import java.io.IOException; +import java.util.Optional; +import org.junit.Test; import software.amazon.awssdk.services.s3.model.HeadObjectResponse; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.contract.ContractTestUtils; +import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset; +import static org.apache.hadoop.fs.contract.ContractTestUtils.writeDataset; +import static org.apache.hadoop.fs.s3a.EncryptionTestUtils.validateEncryptionFileAttributes; import static org.hamcrest.CoreMatchers.containsString; /** @@ -56,4 +62,19 @@ protected void assertEncrypted(Path path) throws IOException { md.serverSideEncryptionAsString()); assertThat(md.ssekmsKeyId(), 
containsString("arn:aws:kms:")); } + + @Test + public void testEncryptionFileAttributes() throws Exception { + describe("Test for correct encryption file attributes for SSE-KMS with server default key."); + Path path = path(createFilename(1024)); + byte[] data = dataset(1024, 'a', 'z'); + S3AFileSystem fs = getFileSystem(); + writeDataset(fs, path, data, data.length, 1024 * 1024, true); + ContractTestUtils.verifyFileContents(fs, path, data); + // we don't know the KMS key in case of server default option. + validateEncryptionFileAttributes(fs, + path, + EncryptionTestUtils.AWS_KMS_SSE_ALGORITHM, + Optional.empty()); + } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionWithDefaultS3Settings.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionWithDefaultS3Settings.java index c246161a938dd..423796bf82b87 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionWithDefaultS3Settings.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionWithDefaultS3Settings.java @@ -19,6 +19,7 @@ package org.apache.hadoop.fs.s3a; import java.io.IOException; +import java.util.Optional; import org.junit.Ignore; import org.junit.Test; @@ -36,6 +37,7 @@ import static org.apache.hadoop.fs.s3a.Constants.S3_ENCRYPTION_ALGORITHM; import static org.apache.hadoop.fs.s3a.Constants.SERVER_SIDE_ENCRYPTION_ALGORITHM; import static org.apache.hadoop.fs.s3a.EncryptionTestUtils.AWS_KMS_SSE_ALGORITHM; +import static org.apache.hadoop.fs.s3a.EncryptionTestUtils.validateEncryptionFileAttributes; import static org.apache.hadoop.fs.s3a.S3AEncryptionMethods.SSE_KMS; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestBucketName; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; @@ -97,6 +99,22 @@ protected void assertEncrypted(Path path) throws IOException { EncryptionTestUtils.assertEncrypted(fs, path, SSE_KMS, kmsKey); } + @Test + public void testEncryptionFileAttributes() throws Exception { + describe("Test for correct encryption file attributes for SSE-KMS with user default setting."); + Path path = path(createFilename(1024)); + byte[] data = dataset(1024, 'a', 'z'); + S3AFileSystem fs = getFileSystem(); + writeDataset(fs, path, data, data.length, 1024 * 1024, true); + ContractTestUtils.verifyFileContents(fs, path, data); + Configuration c = fs.getConf(); + String kmsKey = getS3EncryptionKey(getTestBucketName(c), c); + validateEncryptionFileAttributes(fs, path, AWS_KMS_SSE_ALGORITHM, Optional.of(kmsKey)); + } + + + + @Override @Ignore @Test diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java index 5f90115b8e797..b550fc5864b73 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java @@ -32,9 +32,6 @@ import org.apache.hadoop.fs.store.audit.AuditSpan; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.IOException; import java.util.ArrayList; @@ -45,7 +42,9 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.*; import static org.apache.hadoop.fs.s3a.S3ATestUtils.createFiles; import static org.apache.hadoop.fs.s3a.S3ATestUtils.isBulkDeleteEnabled; +import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.test.ExtraAssertions.failIf; +import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.isUsingDefaultExternalDataFile; import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireDefaultExternalData; import static org.apache.hadoop.test.LambdaTestUtils.*; import static org.apache.hadoop.util.functional.RemoteIterators.mappingRemoteIterator; @@ -55,14 +54,15 @@ * ITest for failure handling, primarily multipart deletion. */ public class ITestS3AFailureHandling extends AbstractS3ATestBase { - private static final Logger LOG = - LoggerFactory.getLogger(ITestS3AFailureHandling.class); @Override protected Configuration createConfiguration() { Configuration conf = super.createConfiguration(); S3ATestUtils.disableFilesystemCaching(conf); conf.setBoolean(Constants.ENABLE_MULTI_DELETE, true); + if (isUsingDefaultExternalDataFile(conf)) { + removeBaseAndBucketOverrides(conf, Constants.ENDPOINT); + } return conf; } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingCacheFiles.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingCacheFiles.java index ce962483d5840..5e6731ed520ad 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingCacheFiles.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingCacheFiles.java @@ -38,9 +38,11 @@ import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest; import static org.apache.hadoop.fs.s3a.Constants.BUFFER_DIR; +import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.PREFETCH_BLOCK_SIZE_KEY; import static org.apache.hadoop.fs.s3a.Constants.PREFETCH_ENABLED_KEY; import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getExternalData; +import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.isUsingDefaultExternalDataFile; import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; /** @@ -84,7 +86,11 @@ public void setUp() throws Exception { @Override public Configuration createConfiguration() { Configuration configuration = super.createConfiguration(); - S3ATestUtils.removeBaseAndBucketOverrides(configuration, PREFETCH_ENABLED_KEY); + if (isUsingDefaultExternalDataFile(configuration)) { + S3ATestUtils.removeBaseAndBucketOverrides(configuration, + PREFETCH_ENABLED_KEY, + ENDPOINT); + } configuration.setBoolean(PREFETCH_ENABLED_KEY, true); // use a small block size unless explicitly set in the test config. 
configuration.setInt(PREFETCH_BLOCK_SIZE_KEY, BLOCK_SIZE); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java index ba9746358c575..4aaf35f0613e0 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java @@ -33,6 +33,7 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.Constants; import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; @@ -53,12 +54,14 @@ import static org.apache.hadoop.fs.s3a.S3ATestUtils.deployService; import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyInt; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.S3ATestUtils.terminateService; import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.probeForAssumedRoleARN; import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.*; import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.assertSecurityEnabled; import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.closeUserFileSystems; import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.getOrcData; +import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.isUsingDefaultExternalDataFile; import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.requireAnonymousDataPath; /** @@ -251,6 +254,10 @@ public void testCommonCrawlLookup() throws Throwable { public void testJobSubmissionCollectsTokens() throws Exception { describe("Mock Job test"); JobConf conf = new JobConf(getConfiguration()); + if (isUsingDefaultExternalDataFile(conf)) { + removeBaseAndBucketOverrides(conf, + Constants.ENDPOINT); + } // the input here is the external file; which lets // us differentiate source URI from dest URI diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java index fd7a528a5d1b8..22a4727739aca 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java @@ -118,12 +118,12 @@ protected int run(Object... args) throws Exception { * Run a S3GuardTool command from a varags list, catch any raised * ExitException and verify the status code matches that expected. * @param status expected status code of the exception + * @param conf configuration object. * @param args argument list * @throws Exception any exception */ - protected void runToFailure(int status, Object... args) + protected void runToFailure(int status, Configuration conf, Object... 
args) throws Exception { - final Configuration conf = getConfiguration(); ExitUtil.ExitException ex = intercept(ExitUtil.ExitException.class, () -> runS3GuardCommand(conf, args)); @@ -247,7 +247,7 @@ public void testUnsupported() throws Throwable { describe("Verify the unsupported tools are rejected"); for (String tool : UNSUPPORTED_COMMANDS) { describe("Probing %s", tool); - runToFailure(E_S3GUARD_UNSUPPORTED, tool); + runToFailure(E_S3GUARD_UNSUPPORTED, getConfiguration(), tool); } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java index 18fdccabaeaf2..59787617b884f 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java @@ -37,6 +37,7 @@ import org.apache.hadoop.util.StringUtils; import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; +import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.FIPS_ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.S3_ENCRYPTION_ALGORITHM; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.assertNoUploadsAt; @@ -48,6 +49,7 @@ import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.E_BAD_STATE; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.Uploads; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec; +import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.isUsingDefaultExternalDataFile; /** * Test S3Guard Tool CLI commands. @@ -60,8 +62,13 @@ public class ITestS3GuardTool extends AbstractS3GuardToolTestBase { @Test public void testExternalBucketRequireUnencrypted() throws Throwable { - removeBaseAndBucketOverrides(getConfiguration(), S3_ENCRYPTION_ALGORITHM); - run(BucketInfo.NAME, + Configuration conf = getConfiguration(); + if (isUsingDefaultExternalDataFile(conf)) { + removeBaseAndBucketOverrides(conf, + S3_ENCRYPTION_ALGORITHM, + ENDPOINT); + } + run(conf, BucketInfo.NAME, "-" + BucketInfo.ENCRYPTION_FLAG, "none", externalBucket()); } @@ -81,10 +88,17 @@ private String externalBucket() { @Test public void testExternalBucketRequireEncrypted() throws Throwable { + Configuration conf = getConfiguration(); + if (isUsingDefaultExternalDataFile(conf)) { + removeBaseAndBucketOverrides(conf, + ENDPOINT); + } runToFailure(E_BAD_STATE, + conf, BucketInfo.NAME, "-" + BucketInfo.ENCRYPTION_FLAG, - "AES256", externalBucket()); + "AES256", + externalBucket()); } @Test @@ -212,9 +226,13 @@ public void testUploadListByAge() throws Throwable { @Test public void testUploadNegativeExpect() throws Throwable { - runToFailure(E_BAD_STATE, Uploads.NAME, "-expect", "1", - path("/we/are/almost/postive/this/doesnt/exist/fhfsadfoijew") - .toString()); + Configuration conf = getConfiguration(); + runToFailure(E_BAD_STATE, + conf, + Uploads.NAME, + "-expect", + "1", + path("/we/are/almost/postive/this/doesnt/exist/fhfsadfoijew").toString()); } private void assertNumUploads(Path path, int numUploads) throws Exception { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AInputStreamPerformance.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AInputStreamPerformance.java index ae09452372316..a787f52bd4d40 100644 --- 
a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AInputStreamPerformance.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AInputStreamPerformance.java @@ -60,6 +60,7 @@ import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getInputStreamStatistics; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getS3AInputStream; +import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.isUsingDefaultExternalDataFile; import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticMinimum; import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.lookupMaximumStatistic; import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.lookupMeanStatistic; @@ -99,7 +100,13 @@ public class ITestS3AInputStreamPerformance extends S3AScaleTestBase { @Override protected Configuration createScaleConfiguration() { Configuration conf = super.createScaleConfiguration(); - S3ATestUtils.removeBaseAndBucketOverrides(conf, PREFETCH_ENABLED_KEY); + S3ATestUtils.removeBaseAndBucketOverrides(conf, + PREFETCH_ENABLED_KEY); + if (isUsingDefaultExternalDataFile(conf)) { + S3ATestUtils.removeBaseAndBucketOverrides( + conf, + ENDPOINT); + } conf.setBoolean(PREFETCH_ENABLED_KEY, false); return conf; } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java index 3835548b1e251..9400cfb3fb2ef 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java @@ -133,6 +133,18 @@ public static String requireDefaultExternalDataFile(Configuration conf) { return filename; } + /** + * To determine whether {@value S3ATestConstants#KEY_CSVTEST_FILE} is configured to be + * different from the default external file. + * + * @param conf Configuration object. + * @return True if the default external data file is being used. + */ + public static boolean isUsingDefaultExternalDataFile(final Configuration conf) { + final String filename = getExternalData(conf).toUri().toString(); + return DEFAULT_EXTERNAL_FILE.equals(filename); + } + /** * Get the test external file; assume() that it is not modified (i.e. 
we haven't * switched to a new storage infrastructure where the bucket is no longer diff --git a/hadoop-tools/hadoop-azure/pom.xml b/hadoop-tools/hadoop-azure/pom.xml index 052015abd6ae8..460e0571a898c 100644 --- a/hadoop-tools/hadoop-azure/pom.xml +++ b/hadoop-tools/hadoop-azure/pom.xml @@ -45,6 +45,8 @@ 7200 10 1000 + + 100 @@ -400,7 +402,8 @@ ${fs.azure.scale.test.timeout} ${fs.azure.scale.test.list.performance.threads} ${fs.azure.scale.test.list.performance.files} - + + ${http.maxConnections} **/azure/Test*.java **/azure/**/Test*.java @@ -431,6 +434,8 @@ ${fs.azure.scale.test.timeout} ${fs.azure.scale.test.list.performance.threads} ${fs.azure.scale.test.list.performance.files} + + ${http.maxConnections} **/azure/**/TestRollingWindowAverage*.java @@ -604,6 +609,8 @@ ${fs.azure.scale.test.enabled} ${fs.azure.scale.test.timeout} + + ${http.maxConnections} @@ -792,6 +799,8 @@ ${fs.azure.scale.test.timeout} ${fs.azure.scale.test.list.performance.threads} ${fs.azure.scale.test.list.performance.files} + + ${http.maxConnections} @@ -842,7 +851,8 @@ ${fs.azure.scale.test.timeout} ${fs.azure.scale.test.list.performance.threads} ${fs.azure.scale.test.list.performance.files} - + + ${http.maxConnections} **/ITestWasbAbfsCompatibility.java **/ITestFileSystemOperationsExceptionHandlingMultiThreaded.java @@ -891,6 +901,8 @@ ${fs.azure.scale.test.timeout} ${fs.azure.scale.test.list.performance.threads} ${fs.azure.scale.test.list.performance.files} + + ${http.maxConnections} ${fs.azure.scale.test.timeout} false diff --git a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/NativeAzureFileSystem.java b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/NativeAzureFileSystem.java index 45fbf791908a3..4e777da8b409f 100644 --- a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/NativeAzureFileSystem.java +++ b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/NativeAzureFileSystem.java @@ -160,9 +160,12 @@ public FolderRenamePending(Path redoFile, NativeAzureFileSystem fs) // open redo file Path f = redoFile; - FSDataInputStream input = fs.open(f); - byte[] bytes = new byte[MAX_RENAME_PENDING_FILE_SIZE]; - int l = input.read(bytes); + int l; + byte[] bytes; + try (FSDataInputStream input = fs.open(f)) { + bytes = new byte[MAX_RENAME_PENDING_FILE_SIZE]; + l = input.read(bytes); + } if (l <= 0) { // Jira HADOOP-12678 -Handle empty rename pending metadata file during // atomic rename in redo path. If during renamepending file is created diff --git a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/FileSystemConfigurations.java b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/FileSystemConfigurations.java index ea7bf943a73d0..0af485bbe56b1 100644 --- a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/FileSystemConfigurations.java +++ b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/FileSystemConfigurations.java @@ -159,7 +159,7 @@ public final class FileSystemConfigurations { /** * IO rate limit. 
Value: {@value} */ - public static final int RATE_LIMIT_DEFAULT = 10_000; + public static final int RATE_LIMIT_DEFAULT = 1_000; private FileSystemConfigurations() {} } diff --git a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAbfsRestOperationException.java b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAbfsRestOperationException.java index 2672b676f9b3a..ca2ab9dabcf43 100644 --- a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAbfsRestOperationException.java +++ b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAbfsRestOperationException.java @@ -187,21 +187,18 @@ public void testAuthFailException() throws Exception { config.set(AZURE_CREATE_REMOTE_FILESYSTEM_DURING_INITIALIZATION, "false"); final AzureBlobFileSystem fs = getFileSystem(config); - try { - intercept(Exception.class, - () -> { - fs.getFileStatus(new Path("/")); - }); - } catch (AbfsRestOperationException e) { - String errorDesc = "Should throw RestOp exception on AAD failure"; - Assertions.assertThat(e.getStatusCode()) - .describedAs("Incorrect status code. " + errorDesc).isEqualTo(-1); - Assertions.assertThat(e.getErrorCode()) - .describedAs("Incorrect error code. " + errorDesc) - .isEqualTo(AzureServiceErrorCode.UNKNOWN); - Assertions.assertThat(e.getErrorMessage()) - .describedAs("Incorrect error message. " + errorDesc) - .contains("Auth failure: "); - } + AbfsRestOperationException e = intercept(AbfsRestOperationException.class, () -> { + fs.getFileStatus(new Path("/")); + }); + + String errorDesc = "Should throw RestOp exception on AAD failure"; + Assertions.assertThat(e.getStatusCode()) + .describedAs("Incorrect status code: " + errorDesc).isEqualTo(-1); + Assertions.assertThat(e.getErrorCode()) + .describedAs("Incorrect error code: " + errorDesc) + .isEqualTo(AzureServiceErrorCode.UNKNOWN); + Assertions.assertThat(e.getErrorMessage()) + .describedAs("Incorrect error message: " + errorDesc) + .contains("Auth failure: "); } -} \ No newline at end of file +} diff --git a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/AbfsCommitTestHelper.java b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/AbfsCommitTestHelper.java index da2a650489077..92ba8a4024a2c 100644 --- a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/AbfsCommitTestHelper.java +++ b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/AbfsCommitTestHelper.java @@ -23,6 +23,7 @@ import org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants; import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.AZURE_READ_SMALL_FILES_COMPLETELY; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE_BASE_FIRST; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_STORE_OPERATIONS_CLASS; /** @@ -51,9 +52,10 @@ static Configuration prepareTestConfiguration( final String size = Integer.toString(192); conf.setIfUnset(ManifestCommitterConstants.OPT_IO_PROCESSORS, size); conf.setIfUnset(ManifestCommitterConstants.OPT_WRITER_QUEUE_CAPACITY, size); - // no need for parallel delete here as we aren't at the scale where unified delete - // is going to time out - conf.setBooleanIfUnset(ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE, false); + // enable parallel delete but ask for base deletion 
first, + // which is now our recommended azure option + conf.setBoolean(ManifestCommitterConstants.OPT_CLEANUP_PARALLEL_DELETE, true); + conf.setBoolean(OPT_CLEANUP_PARALLEL_DELETE_BASE_FIRST, true); return conf; } diff --git a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestAbfsTerasort.java b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestAbfsTerasort.java index 4b21b838decc5..820938b2d68ef 100644 --- a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestAbfsTerasort.java +++ b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestAbfsTerasort.java @@ -39,6 +39,7 @@ import org.apache.hadoop.examples.terasort.TeraSortConfigKeys; import org.apache.hadoop.examples.terasort.TeraValidate; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.statistics.IOStatistics; import org.apache.hadoop.fs.statistics.IOStatisticsLogging; import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot; import org.apache.hadoop.mapred.JobConf; @@ -52,6 +53,9 @@ import static java.util.Optional.empty; import static org.apache.hadoop.fs.CommonConfigurationKeys.IOSTATISTICS_LOGGING_LEVEL_INFO; import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_RENAME_FILE; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_SAVE_TASK_MANIFEST; +import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.assertNoFailureStatistics; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.loadSuccessFile; import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterTestSupport.validateSuccessFile; @@ -95,6 +99,11 @@ public class ITestAbfsTerasort extends AbstractAbfsClusterITest { protected static final IOStatisticsSnapshot JOB_IOSTATS = snapshotIOStatistics(); + /** + * Map of stage -> success file. + */ + private static final Map SUCCESS_FILES = new HashMap<>(); + /** Base path for all the terasort input and output paths. */ private Path terasortPath; @@ -188,9 +197,10 @@ private static void requireStage(final String stage) { * @param tool tool to run. * @param args args for the tool. * @param minimumFileCount minimum number of files to have been created + * @return the job success file. * @throws Exception any failure */ - private void executeStage( + private ManifestSuccessData executeStage( final String stage, final JobConf jobConf, final Path dest, @@ -213,9 +223,20 @@ private void executeStage( + " failed", 0, result); final ManifestSuccessData successFile = validateSuccessFile(getFileSystem(), dest, minimumFileCount, ""); - JOB_IOSTATS.aggregate(successFile.getIOStatistics()); - + final IOStatistics iostats = successFile.getIOStatistics(); + JOB_IOSTATS.aggregate(iostats); + SUCCESS_FILES.put(stage, successFile); completedStage(stage, d); + + // now assert there were no failures recorded in the IO statistics + // for critical functions. + // these include collected statistics from manifest save + // operations. 
+ assertNoFailureStatistics(iostats, + stage, + OP_SAVE_TASK_MANIFEST, + OP_RENAME_FILE); + return successFile; } /** @@ -319,6 +340,7 @@ public void test_140_teracomplete() throws Throwable { File resultsFile = File.createTempFile("results", ".csv"); FileUtils.write(resultsFile, text, StandardCharsets.UTF_8); LOG.info("Results are in {}\n{}", resultsFile, text); + LOG.info("Report directory {}", getReportDir()); } /** diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 650e82d673813..2a51065404cac 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -2733,6 +2733,10 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH = NM_PREFIX + "linux-container-executor.cgroups.mount-path"; + /** Where the linux container executor should mount cgroups v2 if not found. */ + public static final String NM_LINUX_CONTAINER_CGROUPS_V2_MOUNT_PATH = + NM_PREFIX + "linux-container-executor.cgroups.v2.mount-path"; + /** * Whether the apps should run in strict resource usage mode(not allowed to * use spare CPU) @@ -2804,6 +2808,14 @@ public static boolean isAclEnabled(Configuration conf) { public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY = 20; + /** + * Boolean indicating whether cgroup v2 is enabled. + */ + public static final String NM_LINUX_CONTAINER_CGROUPS_V2_ENABLED = + NM_PREFIX + "linux-container-executor.cgroups.v2.enabled"; + + public static final boolean DEFAULT_NM_LINUX_CONTAINER_CGROUPS_V2_ENABLED = false; + /** * Indicates if memory and CPU limits will be set for the Windows Job * Object for the containers launched by the default container executor. 
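The two new YarnConfiguration keys above only take effect together: cgroup v2 handling has to be enabled explicitly, and the separate v2 mount path matters only in a hybrid layout where the v2 unified hierarchy is mounted alongside the v1 controllers. A minimal configuration sketch, assuming the hybrid layout used as the example in yarn-default.xml further down (v1 under /sys/fs/cgroup, v2 under /sys/fs/cgroup/unified); the paths are illustrative, not defaults:

// Minimal sketch: enable cgroup v2 handling in a hybrid v1/v2 node manager setup.
Configuration conf = new Configuration();
conf.setBoolean(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_V2_ENABLED, true);
// v1 controllers stay on the existing mount path key
conf.set(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH, "/sys/fs/cgroup");
// unified (v2) hierarchy mounted alongside; if unset, the v1 mount path is reused
conf.set(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_V2_MOUNT_PATH,
    "/sys/fs/cgroup/unified");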
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-catalog/hadoop-yarn-applications-catalog-docker/Dockerfile b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-catalog/hadoop-yarn-applications-catalog-docker/Dockerfile index 4a3ec65fb0005..2369bc10adc97 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-catalog/hadoop-yarn-applications-catalog-docker/Dockerfile +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-catalog/hadoop-yarn-applications-catalog-docker/Dockerfile @@ -19,7 +19,7 @@ FROM centos:7 RUN yum -y install tomcat lsof krb5-workstation sssd-client curl SHELL ["/bin/bash", "-o", "pipefail", "-c"] RUN mkdir -p /opt/apache/solr && \ - curl -SL http://archive.apache.org/dist/lucene/solr/7.7.0/solr-7.7.0.tgz | \ + curl -SL https://downloads.apache.org/lucene/solr/8.11.2/solr-8.11.2.tgz | \ tar -xzC /opt/apache/solr --strip 1 COPY src/main/scripts/setup-image.sh /setup-image.sh COPY src/main/resources/samples.xml /tmp/samples.xml diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 6b2d2cd817c65..275fc08cb2ca9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -2087,6 +2087,20 @@ yarn.nodemanager.linux-container-executor.cgroups.mount-path + + This property sets the mount path for CGroups v2. + This parameter is optional, and needed to be set only in mixed mode, + when CGroups v2 is mounted alongside with Cgroups v1. + For example, in hybrid mode, CGroups v1 controllers can be mounted under /sys/fs/cgroup/ + (for example /sys/fs/cgroup/cpu,cpuacct), while v2 can be mounted in /sys/fs/cgroup/unified folder. + + If this value is not set, the value of + yarn.nodemanager.linux-container-executor.cgroups.mount-path + will be used for CGroups v2 as well. + + yarn.nodemanager.linux-container-executor.cgroups.v2.mount-path + + Delay in ms between attempts to remove linux cgroup yarn.nodemanager.linux-container-executor.cgroups.delete-delay-ms @@ -2464,6 +2478,12 @@ 1000 + + yarn.nodemanager.linux-container-executor.cgroups.v2.enabled + false + Boolean indicating whether cgroup v2 is enabled. + + T-file compression types used to compress aggregated logs. 
yarn.nodemanager.log-aggregation.compression-type diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/AbstractCGroupsHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/AbstractCGroupsHandler.java index a8f528a209113..becb68e22f0ff 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/AbstractCGroupsHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/AbstractCGroupsHandler.java @@ -358,14 +358,14 @@ private void initializePreMountedCGroupController(CGroupController controller) } else { // Unexpected: we just checked that it was missing throw new ResourceHandlerException(getErrorWithDetails( - "Unexpected: Cannot create yarn cgroup", + "Unexpected: Cannot create yarn cgroup hierarchy", subsystemName, yarnHierarchy.getAbsolutePath() )); } } catch (SecurityException e) { throw new ResourceHandlerException(getErrorWithDetails( - "No permissions to create yarn cgroup", + "No permissions to create yarn cgroup hierarchy", subsystemName, yarnHierarchy.getAbsolutePath() ), e); @@ -378,15 +378,7 @@ private void initializePreMountedCGroupController(CGroupController controller) )); } - try { - updateEnabledControllersInHierarchy(yarnHierarchy, controller); - } catch (ResourceHandlerException e) { - throw new ResourceHandlerException(getErrorWithDetails( - "Failed to update cgroup.subtree_control in yarn hierarchy", - subsystemName, - yarnHierarchy.getAbsolutePath() - )); - } + updateEnabledControllersInHierarchy(yarnHierarchy, controller); } protected abstract void updateEnabledControllersInHierarchy( @@ -401,7 +393,7 @@ protected abstract void updateEnabledControllersInHierarchy( * @param yarnCgroupPath cgroup path that failed * @return a string builder that can be appended by the caller */ - private String getErrorWithDetails( + protected String getErrorWithDetails( String errorMessage, String subsystemName, String yarnCgroupPath) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/AbstractCGroupsMemoryResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/AbstractCGroupsMemoryResourceHandler.java new file mode 100644 index 0000000000000..36c21ab93f56d --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/AbstractCGroupsMemoryResourceHandler.java @@ -0,0 +1,158 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ExecutionType; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +@InterfaceStability.Unstable +@InterfaceAudience.Private +public abstract class AbstractCGroupsMemoryResourceHandler implements MemoryResourceHandler { + + static final Logger LOG = + LoggerFactory.getLogger(CGroupsMemoryResourceHandlerImpl.class); + protected static final CGroupsHandler.CGroupController MEMORY = + CGroupsHandler.CGroupController.MEMORY; + + private CGroupsHandler cGroupsHandler; + + protected static final int OPPORTUNISTIC_SOFT_LIMIT = 0; + // multiplier to set the soft limit - value should be between 0 and 1 + private float softLimit = 0.0f; + private boolean enforce = true; + + public AbstractCGroupsMemoryResourceHandler(CGroupsHandler cGroupsHandler) { + this.cGroupsHandler = cGroupsHandler; + } + + protected CGroupsHandler getCGroupsHandler() { + return cGroupsHandler; + } + + @Override + public List bootstrap(Configuration conf) + throws ResourceHandlerException { + this.cGroupsHandler.initializeCGroupController(MEMORY); + enforce = conf.getBoolean( + YarnConfiguration.NM_MEMORY_RESOURCE_ENFORCED, + YarnConfiguration.DEFAULT_NM_MEMORY_RESOURCE_ENFORCED); + float softLimitPerc = conf.getFloat( + YarnConfiguration.NM_MEMORY_RESOURCE_CGROUPS_SOFT_LIMIT_PERCENTAGE, + YarnConfiguration. + DEFAULT_NM_MEMORY_RESOURCE_CGROUPS_SOFT_LIMIT_PERCENTAGE); + softLimit = softLimitPerc / 100.0f; + if (softLimitPerc < 0.0f || softLimitPerc > 100.0f) { + throw new ResourceHandlerException( + "Illegal value '" + softLimitPerc + "' " + + YarnConfiguration. + NM_MEMORY_RESOURCE_CGROUPS_SOFT_LIMIT_PERCENTAGE + + ". 
Value must be between 0 and 100."); + } + return null; + } + + @Override + public List updateContainer(Container container) + throws ResourceHandlerException { + String cgroupId = container.getContainerId().toString(); + File cgroup = new File(cGroupsHandler.getPathForCGroup(MEMORY, cgroupId)); + if (cgroup.exists()) { + //memory is in MB + long containerSoftLimit = + (long) (container.getResource().getMemorySize() * this.softLimit); + long containerHardLimit = container.getResource().getMemorySize(); + if (enforce) { + try { + updateMemoryHardLimit(cgroupId, containerHardLimit); + ContainerTokenIdentifier id = container.getContainerTokenIdentifier(); + if (id != null && id.getExecutionType() == + ExecutionType.OPPORTUNISTIC) { + updateOpportunisticMemoryLimits(cgroupId); + } else { + updateGuaranteedMemoryLimits(cgroupId, containerSoftLimit); + } + } catch (ResourceHandlerException re) { + cGroupsHandler.deleteCGroup(MEMORY, cgroupId); + LOG.warn("Could not update cgroup for container", re); + throw re; + } + } + } + return null; + } + + protected abstract void updateMemoryHardLimit(String cgroupId, long containerHardLimit) + throws ResourceHandlerException; + + protected abstract void updateOpportunisticMemoryLimits(String cgroupId) + throws ResourceHandlerException; + + protected abstract void updateGuaranteedMemoryLimits(String cgroupId, long containerSoftLimit) + throws ResourceHandlerException; + + @Override + public List reacquireContainer(ContainerId containerId) + throws ResourceHandlerException { + return null; + } + + @Override + public List preStart(Container container) + throws ResourceHandlerException { + String cgroupId = container.getContainerId().toString(); + cGroupsHandler.createCGroup(MEMORY, cgroupId); + updateContainer(container); + List ret = new ArrayList<>(); + ret.add(new PrivilegedOperation( + PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP, + PrivilegedOperation.CGROUP_ARG_PREFIX + + cGroupsHandler.getPathForCGroupTasks(MEMORY, cgroupId))); + return ret; + } + + @Override + public List postComplete(ContainerId containerId) + throws ResourceHandlerException { + cGroupsHandler.deleteCGroup(MEMORY, containerId.toString()); + return null; + } + + @Override + public List teardown() throws ResourceHandlerException { + return null; + } + + @Override + public String toString() { + return AbstractCGroupsMemoryResourceHandler.class.getName(); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java index b8b4b2b7e3e76..e13d390e64c61 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandler.java @@ -123,6 +123,8 @@ public static Set getValidV2CGroups() { String CGROUP_CONTROLLERS_FILE = "cgroup.controllers"; String CGROUP_SUBTREE_CONTROL_FILE = "cgroup.subtree_control"; String CGROUP_CPU_MAX = "max"; + String CGROUP_MEMORY_MAX = "max"; + String CGROUP_MEMORY_LOW = "low"; // present in v1 and v2 
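// Illustrative sketch only -- NOT the patch's actual cgroup v2 memory handler (which is
// not shown in this hunk): one way the abstract hooks of AbstractCGroupsMemoryResourceHandler
// could be mapped onto the CGROUP_MEMORY_MAX and CGROUP_MEMORY_LOW parameters declared
// above (the v2 memory.max / memory.low interface files). The class name and wiring are
// assumptions; the value formatting copies the "<MB>M" style of the v1 handler.
class SketchCGroupsV2MemoryResourceHandler extends AbstractCGroupsMemoryResourceHandler {

  SketchCGroupsV2MemoryResourceHandler(CGroupsHandler cGroupsHandler) {
    super(cGroupsHandler);
  }

  @Override
  protected void updateMemoryHardLimit(String cgroupId, long containerHardLimit)
      throws ResourceHandlerException {
    // hard cap on memory use, written through the memory controller's "max" parameter
    getCGroupsHandler().updateCGroupParam(MEMORY, cgroupId,
        CGroupsHandler.CGROUP_MEMORY_MAX, containerHardLimit + "M");
  }

  @Override
  protected void updateOpportunisticMemoryLimits(String cgroupId)
      throws ResourceHandlerException {
    // opportunistic containers get no memory guarantee; v2 has no swappiness knob to set
    updateGuaranteedMemoryLimits(cgroupId, OPPORTUNISTIC_SOFT_LIMIT);
  }

  @Override
  protected void updateGuaranteedMemoryLimits(String cgroupId, long containerSoftLimit)
      throws ResourceHandlerException {
    // soft guarantee, written through the memory controller's "low" parameter
    getCGroupsHandler().updateCGroupParam(MEMORY, cgroupId,
        CGroupsHandler.CGROUP_MEMORY_LOW, containerSoftLimit + "M");
  }
}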
String CGROUP_PROCS_FILE = "cgroup.procs"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMemoryResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMemoryResourceHandlerImpl.java index ee1cc0c605e52..b8ee85905272f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMemoryResourceHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMemoryResourceHandlerImpl.java @@ -19,20 +19,12 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; import org.apache.hadoop.classification.VisibleForTesting; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.yarn.api.records.ContainerId; -import org.apache.hadoop.yarn.api.records.ExecutionType; import org.apache.hadoop.yarn.conf.YarnConfiguration; -import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; -import java.io.File; -import java.util.ArrayList; import java.util.List; /** @@ -43,32 +35,19 @@ */ @InterfaceAudience.Private @InterfaceStability.Unstable -public class CGroupsMemoryResourceHandlerImpl implements MemoryResourceHandler { +public class CGroupsMemoryResourceHandlerImpl extends AbstractCGroupsMemoryResourceHandler { - static final Logger LOG = - LoggerFactory.getLogger(CGroupsMemoryResourceHandlerImpl.class); - private static final CGroupsHandler.CGroupController MEMORY = - CGroupsHandler.CGroupController.MEMORY; private static final int OPPORTUNISTIC_SWAPPINESS = 100; - private static final int OPPORTUNISTIC_SOFT_LIMIT = 0; - - private CGroupsHandler cGroupsHandler; - private boolean enforce = true; private int swappiness = 0; - // multiplier to set the soft limit - value should be between 0 and 1 - private float softLimit = 0.0f; CGroupsMemoryResourceHandlerImpl(CGroupsHandler cGroupsHandler) { - this.cGroupsHandler = cGroupsHandler; + super(cGroupsHandler); } @Override public List bootstrap(Configuration conf) throws ResourceHandlerException { - this.cGroupsHandler.initializeCGroupController(MEMORY); - enforce = conf.getBoolean( - YarnConfiguration.NM_MEMORY_RESOURCE_ENFORCED, - YarnConfiguration.DEFAULT_NM_MEMORY_RESOURCE_ENFORCED); + super.bootstrap(conf); swappiness = conf .getInt(YarnConfiguration.NM_MEMORY_RESOURCE_CGROUPS_SWAPPINESS, YarnConfiguration.DEFAULT_NM_MEMORY_RESOURCE_CGROUPS_SWAPPINESS); @@ -78,18 +57,6 @@ public List bootstrap(Configuration conf) + YarnConfiguration.NM_MEMORY_RESOURCE_CGROUPS_SWAPPINESS + ". Value must be between 0 and 100."); } - float softLimitPerc = conf.getFloat( - YarnConfiguration.NM_MEMORY_RESOURCE_CGROUPS_SOFT_LIMIT_PERCENTAGE, - YarnConfiguration. 
- DEFAULT_NM_MEMORY_RESOURCE_CGROUPS_SOFT_LIMIT_PERCENTAGE); - softLimit = softLimitPerc / 100.0f; - if (softLimitPerc < 0.0f || softLimitPerc > 100.0f) { - throw new ResourceHandlerException( - "Illegal value '" + softLimitPerc + "' " - + YarnConfiguration. - NM_MEMORY_RESOURCE_CGROUPS_SOFT_LIMIT_PERCENTAGE - + ". Value must be between 0 and 100."); - } return null; } @@ -99,81 +66,31 @@ int getSwappiness() { } @Override - public List reacquireContainer(ContainerId containerId) - throws ResourceHandlerException { - return null; - } - - @Override - public List updateContainer(Container container) + protected void updateMemoryHardLimit(String cgroupId, long containerHardLimit) throws ResourceHandlerException { - String cgroupId = container.getContainerId().toString(); - File cgroup = new File(cGroupsHandler.getPathForCGroup(MEMORY, cgroupId)); - if (cgroup.exists()) { - //memory is in MB - long containerSoftLimit = - (long) (container.getResource().getMemorySize() * this.softLimit); - long containerHardLimit = container.getResource().getMemorySize(); - if (enforce) { - try { - cGroupsHandler.updateCGroupParam(MEMORY, cgroupId, - CGroupsHandler.CGROUP_PARAM_MEMORY_HARD_LIMIT_BYTES, - String.valueOf(containerHardLimit) + "M"); - ContainerTokenIdentifier id = container.getContainerTokenIdentifier(); - if (id != null && id.getExecutionType() == - ExecutionType.OPPORTUNISTIC) { - cGroupsHandler.updateCGroupParam(MEMORY, cgroupId, - CGroupsHandler.CGROUP_PARAM_MEMORY_SOFT_LIMIT_BYTES, - String.valueOf(OPPORTUNISTIC_SOFT_LIMIT) + "M"); - cGroupsHandler.updateCGroupParam(MEMORY, cgroupId, - CGroupsHandler.CGROUP_PARAM_MEMORY_SWAPPINESS, - String.valueOf(OPPORTUNISTIC_SWAPPINESS)); - } else { - cGroupsHandler.updateCGroupParam(MEMORY, cgroupId, - CGroupsHandler.CGROUP_PARAM_MEMORY_SOFT_LIMIT_BYTES, - String.valueOf(containerSoftLimit) + "M"); - cGroupsHandler.updateCGroupParam(MEMORY, cgroupId, - CGroupsHandler.CGROUP_PARAM_MEMORY_SWAPPINESS, - String.valueOf(swappiness)); - } - } catch (ResourceHandlerException re) { - cGroupsHandler.deleteCGroup(MEMORY, cgroupId); - LOG.warn("Could not update cgroup for container", re); - throw re; - } - } - } - return null; + getCGroupsHandler().updateCGroupParam(MEMORY, cgroupId, + CGroupsHandler.CGROUP_PARAM_MEMORY_HARD_LIMIT_BYTES, + String.valueOf(containerHardLimit) + "M"); } @Override - public List preStart(Container container) - throws ResourceHandlerException { - String cgroupId = container.getContainerId().toString(); - cGroupsHandler.createCGroup(MEMORY, cgroupId); - updateContainer(container); - List ret = new ArrayList<>(); - ret.add(new PrivilegedOperation( - PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP, - PrivilegedOperation.CGROUP_ARG_PREFIX - + cGroupsHandler.getPathForCGroupTasks(MEMORY, cgroupId))); - return ret; + protected void updateOpportunisticMemoryLimits(String cgroupId) throws ResourceHandlerException { + getCGroupsHandler().updateCGroupParam(MEMORY, cgroupId, + CGroupsHandler.CGROUP_PARAM_MEMORY_SOFT_LIMIT_BYTES, + String.valueOf(OPPORTUNISTIC_SOFT_LIMIT) + "M"); + getCGroupsHandler().updateCGroupParam(MEMORY, cgroupId, + CGroupsHandler.CGROUP_PARAM_MEMORY_SWAPPINESS, + String.valueOf(OPPORTUNISTIC_SWAPPINESS)); } @Override - public List postComplete(ContainerId containerId) + protected void updateGuaranteedMemoryLimits(String cgroupId, long containerSoftLimit) throws ResourceHandlerException { - cGroupsHandler.deleteCGroup(MEMORY, containerId.toString()); - return null; - } - - @Override - public List teardown() throws 
ResourceHandlerException { - return null; - } - - @Override - public String toString() { - return CGroupsMemoryResourceHandlerImpl.class.getName(); + getCGroupsHandler().updateCGroupParam(MEMORY, cgroupId, + CGroupsHandler.CGROUP_PARAM_MEMORY_SOFT_LIMIT_BYTES, + String.valueOf(containerSoftLimit) + "M"); + getCGroupsHandler().updateCGroupParam(MEMORY, cgroupId, + CGroupsHandler.CGROUP_PARAM_MEMORY_SWAPPINESS, + String.valueOf(swappiness)); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMountConfig.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMountConfig.java index 6eb8667f2a808..b5a71f6072846 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMountConfig.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsMountConfig.java @@ -26,11 +26,17 @@ public class CGroupsMountConfig { private final boolean enableMount; private final String mountPath; + // CGroups v2 mount path is only relevant in mixed CGroups v1/v2 mode, + // where v2 is mounted alongside with v1. + private final String v2MountPath; + public CGroupsMountConfig(Configuration conf) { this.enableMount = conf.getBoolean(YarnConfiguration. NM_LINUX_CONTAINER_CGROUPS_MOUNT, false); this.mountPath = conf.get(YarnConfiguration. NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH, null); + this.v2MountPath = conf.get(YarnConfiguration. 
+ NM_LINUX_CONTAINER_CGROUPS_V2_MOUNT_PATH, mountPath); } public boolean ensureMountPathIsDefined() throws ResourceHandlerException { @@ -62,11 +68,16 @@ public String getMountPath() { return mountPath; } + public String getV2MountPath() { + return v2MountPath; + } + @Override public String toString() { return "CGroupsMountConfig{" + "enableMount=" + enableMount + - ", mountPath='" + mountPath + '\'' + + ", mountPath='" + mountPath + + ", v2MountPath='" + v2MountPath + '\'' + '}'; } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsV2HandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsV2HandlerImpl.java index 312627f89ba39..356a4c42dfb31 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsV2HandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsV2HandlerImpl.java @@ -97,10 +97,8 @@ protected List getCGroupControllers() { @Override protected Map> parsePreConfiguredMountPath() throws IOException { Map> controllerMappings = new HashMap<>(); - String controllerPath = this.cGroupsMountConfig.getMountPath() + - Path.SEPARATOR + this.cGroupPrefix; - controllerMappings.put(this.cGroupsMountConfig.getMountPath(), - readControllersFile(controllerPath)); + controllerMappings.put(this.cGroupsMountConfig.getV2MountPath(), + readControllersFile(this.cGroupsMountConfig.getV2MountPath())); return controllerMappings; } @@ -171,19 +169,32 @@ protected void updateEnabledControllersInHierarchy( try { Set enabledControllers = readControllersFile(yarnHierarchy.getAbsolutePath()); if (!enabledControllers.contains(controller.getName())) { - throw new ResourceHandlerException(String.format( + String errorMsg = String.format( "The controller %s is not enabled in the cgroup hierarchy: %s. 
Please enable it in " + "in the %s/cgroup.subtree_control file.", controller.getName(), yarnHierarchy.getAbsolutePath(), - yarnHierarchy.getParentFile().getAbsolutePath())); + yarnHierarchy.getParentFile().getAbsolutePath()); + + throw new ResourceHandlerException(getErrorWithDetails( + errorMsg, controller.getName(), + yarnHierarchy.getAbsolutePath())); } File subtreeControlFile = new File(yarnHierarchy.getAbsolutePath() + Path.SEPARATOR + CGROUP_SUBTREE_CONTROL_FILE); if (!subtreeControlFile.exists()) { - throw new ResourceHandlerException( - "No subtree control file found in the cgroup hierarchy: " + - yarnHierarchy.getAbsolutePath()); + String errorMsg = "No subtree control file found in the cgroup hierarchy: " + + yarnHierarchy.getAbsolutePath(); + throw new ResourceHandlerException(getErrorWithDetails( + errorMsg, controller.getName(), + yarnHierarchy.getAbsolutePath())); + } + if (!subtreeControlFile.canWrite()) { + String errorMsg = "Cannot write the cgroup.subtree_control file in the " + + "cgroup hierarchy: " + yarnHierarchy.getAbsolutePath(); + throw new ResourceHandlerException(getErrorWithDetails( + errorMsg, controller.getName(), + yarnHierarchy.getAbsolutePath())); } Writer w = new OutputStreamWriter(Files.newOutputStream(subtreeControlFile.toPath(), @@ -194,16 +205,20 @@ protected void updateEnabledControllersInHierarchy( yarnHierarchy.getAbsolutePath()); pw.write("+" + controller.getName()); if (pw.checkError()) { - throw new ResourceHandlerException("Failed to add the controller to the " + + String errorMsg = "Failed to add the controller to the " + "cgroup.subtree_control file in the cgroup hierarchy: " + - yarnHierarchy.getAbsolutePath()); + yarnHierarchy.getAbsolutePath(); + throw new ResourceHandlerException(getErrorWithDetails( + errorMsg, controller.getName(), + yarnHierarchy.getAbsolutePath())); } } } catch (IOException e) { - throw new ResourceHandlerException( - "Failed to update the cgroup.subtree_control file in the cgroup hierarchy: " + - yarnHierarchy.getAbsolutePath(), e); + String errorMsg = "Failed to update the cgroup.subtree_control file in the " + + "cgroup hierarchy: " + yarnHierarchy.getAbsolutePath(); + throw new ResourceHandlerException(getErrorWithDetails( + errorMsg, controller.getName(), + yarnHierarchy.getAbsolutePath())); } } - } \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsV2MemoryResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsV2MemoryResourceHandlerImpl.java new file mode 100644 index 0000000000000..bc6e33c553b43 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsV2MemoryResourceHandlerImpl.java @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; + +/** + * Handler class to handle the memory controller. YARN already ships a + * physical memory monitor in Java but it isn't as + * good as CGroups. This handler sets the soft and hard memory limits. The soft + * limit is set to 90% of the hard limit. + */ +public class CGroupsV2MemoryResourceHandlerImpl extends AbstractCGroupsMemoryResourceHandler { + + CGroupsV2MemoryResourceHandlerImpl(CGroupsHandler cGroupsHandler) { + super(cGroupsHandler); + } + + @Override + protected void updateMemoryHardLimit(String cgroupId, long containerHardLimit) + throws ResourceHandlerException { + getCGroupsHandler().updateCGroupParam(MEMORY, cgroupId, + CGroupsHandler.CGROUP_MEMORY_MAX, String.valueOf(containerHardLimit) + "M"); + } + + @Override + protected void updateOpportunisticMemoryLimits(String cgroupId) throws ResourceHandlerException { + updateGuaranteedMemoryLimits(cgroupId, OPPORTUNISTIC_SOFT_LIMIT); + } + + @Override + protected void updateGuaranteedMemoryLimits(String cgroupId, long containerSoftLimit) + throws ResourceHandlerException { + getCGroupsHandler().updateCGroupParam(MEMORY, cgroupId, + CGroupsHandler.CGROUP_MEMORY_LOW, String.valueOf(containerSoftLimit) + "M"); + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java index 2ee2f44af204e..98492779b01e5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java @@ -55,6 +55,7 @@ public class ResourceHandlerModule { static final Logger LOG = LoggerFactory.getLogger(ResourceHandlerModule.class); + private static boolean cgroupsV2Enabled; private static volatile ResourceHandlerChain resourceHandlerChain; /** @@ -62,35 +63,55 @@ public class ResourceHandlerModule { * as resource metrics functionality. We need to ensure that the same * instance is used for both. 
*/ + private static volatile CGroupsHandler cGroupV1Handler; + private static volatile CGroupsHandler cGroupV2Handler; private static volatile TrafficControlBandwidthHandlerImpl trafficControlBandwidthHandler; private static volatile NetworkPacketTaggingHandlerImpl networkPacketTaggingHandlerImpl; - private static volatile CGroupsHandler cGroupsHandler; private static volatile CGroupsBlkioResourceHandlerImpl cGroupsBlkioResourceHandler; - private static volatile CGroupsMemoryResourceHandlerImpl + private static volatile MemoryResourceHandler cGroupsMemoryResourceHandler; - private static volatile CGroupsCpuResourceHandlerImpl + private static volatile CpuResourceHandler cGroupsCpuResourceHandler; - /** - * Returns an initialized, thread-safe CGroupsHandler instance. - */ - private static CGroupsHandler getInitializedCGroupsHandler(Configuration conf) + private static void initializeCGroupHandlers(Configuration conf) + throws ResourceHandlerException { + initializeCGroupV1Handler(conf); + if (cgroupsV2Enabled) { + initializeCGroupV2Handler(conf); + } + } + + private static void initializeCGroupV1Handler(Configuration conf) throws ResourceHandlerException { - if (cGroupsHandler == null) { + if (cGroupV1Handler == null) { synchronized (CGroupsHandler.class) { - if (cGroupsHandler == null) { - // TODO determine cgroup version - cGroupsHandler = new CGroupsHandlerImpl(conf, - PrivilegedOperationExecutor.getInstance(conf)); - LOG.debug("Value of CGroupsHandler is: {}", cGroupsHandler); + if (cGroupV1Handler == null) { + cGroupV1Handler = new CGroupsHandlerImpl( + conf, PrivilegedOperationExecutor.getInstance(conf)); + LOG.debug("Value of CGroupsV1Handler is: {}", cGroupV1Handler); } } } + } - return cGroupsHandler; + private static void initializeCGroupV2Handler(Configuration conf) + throws ResourceHandlerException { + if (cGroupV2Handler == null) { + synchronized (CGroupsHandler.class) { + if (cGroupV2Handler == null) { + cGroupV2Handler = new CGroupsV2HandlerImpl( + conf, PrivilegedOperationExecutor.getInstance(conf)); + LOG.debug("Value of CGroupsV2Handler is: {}", cGroupV2Handler); + } + } + } + } + + private static boolean isMountedInCGroupsV2(CGroupsHandler.CGroupController controller) { + return (cGroupV2Handler != null && cGroupV2Handler.getControllerPath(controller) != null); } /** @@ -100,7 +121,7 @@ private static CGroupsHandler getInitializedCGroupsHandler(Configuration conf) */ public static CGroupsHandler getCGroupsHandler() { - return cGroupsHandler; + return cGroupV1Handler; } /** @@ -108,10 +129,10 @@ public static CGroupsHandler getCGroupsHandler() { * not initialized, or if the path is empty. 
*/ public static String getCgroupsRelativeRoot() { - if (cGroupsHandler == null) { + if (getCGroupsHandler() == null) { return null; } - String cGroupPath = cGroupsHandler.getRelativePathForCGroup(""); + String cGroupPath = getCGroupsHandler().getRelativePathForCGroup(""); if (cGroupPath == null || cGroupPath.isEmpty()) { return null; } @@ -138,7 +159,7 @@ public static String getCgroupsRelativeRoot() { return cGroupsCpuResourceHandler; } - private static CGroupsCpuResourceHandlerImpl initCGroupsCpuResourceHandler( + private static CpuResourceHandler initCGroupsCpuResourceHandler( Configuration conf) throws ResourceHandlerException { boolean cgroupsCpuEnabled = conf.getBoolean(YarnConfiguration.NM_CPU_RESOURCE_ENABLED, @@ -152,9 +173,13 @@ private static CGroupsCpuResourceHandlerImpl initCGroupsCpuResourceHandler( synchronized (CpuResourceHandler.class) { if (cGroupsCpuResourceHandler == null) { LOG.debug("Creating new cgroups cpu handler"); - cGroupsCpuResourceHandler = - new CGroupsCpuResourceHandlerImpl( - getInitializedCGroupsHandler(conf)); + + initializeCGroupHandlers(conf); + if (isMountedInCGroupsV2(CGroupsHandler.CGroupController.CPU)) { + cGroupsCpuResourceHandler = new CGroupsV2CpuResourceHandlerImpl(cGroupV2Handler); + } else { + cGroupsCpuResourceHandler = new CGroupsCpuResourceHandlerImpl(cGroupV1Handler); + } return cGroupsCpuResourceHandler; } } @@ -172,9 +197,11 @@ private static CGroupsCpuResourceHandlerImpl initCGroupsCpuResourceHandler( synchronized (OutboundBandwidthResourceHandler.class) { if (trafficControlBandwidthHandler == null) { LOG.info("Creating new traffic control bandwidth handler."); + + initializeCGroupHandlers(conf); trafficControlBandwidthHandler = new TrafficControlBandwidthHandlerImpl(PrivilegedOperationExecutor - .getInstance(conf), getInitializedCGroupsHandler(conf), + .getInstance(conf), cGroupV1Handler, new TrafficController(conf, PrivilegedOperationExecutor .getInstance(conf))); } @@ -207,10 +234,11 @@ public static ResourceHandler getNetworkTaggingHandler(Configuration conf) synchronized (OutboundBandwidthResourceHandler.class) { if (networkPacketTaggingHandlerImpl == null) { LOG.info("Creating new network-tagging-handler."); + + initializeCGroupHandlers(conf); networkPacketTaggingHandlerImpl = new NetworkPacketTaggingHandlerImpl( - PrivilegedOperationExecutor.getInstance(conf), - getInitializedCGroupsHandler(conf)); + PrivilegedOperationExecutor.getInstance(conf), cGroupV1Handler); } } } @@ -238,9 +266,10 @@ private static CGroupsBlkioResourceHandlerImpl getCgroupsBlkioResourceHandler( synchronized (DiskResourceHandler.class) { if (cGroupsBlkioResourceHandler == null) { LOG.debug("Creating new cgroups blkio handler"); + + initializeCGroupHandlers(conf); cGroupsBlkioResourceHandler = - new CGroupsBlkioResourceHandlerImpl( - getInitializedCGroupsHandler(conf)); + new CGroupsBlkioResourceHandlerImpl(cGroupV1Handler); } } } @@ -256,15 +285,19 @@ public static MemoryResourceHandler initMemoryResourceHandler( return null; } - private static CGroupsMemoryResourceHandlerImpl + private static MemoryResourceHandler getCgroupsMemoryResourceHandler( Configuration conf) throws ResourceHandlerException { if (cGroupsMemoryResourceHandler == null) { synchronized (MemoryResourceHandler.class) { if (cGroupsMemoryResourceHandler == null) { - cGroupsMemoryResourceHandler = - new CGroupsMemoryResourceHandlerImpl( - getInitializedCGroupsHandler(conf)); + + initializeCGroupHandlers(conf); + if (isMountedInCGroupsV2(CGroupsHandler.CGroupController.MEMORY)) { + 
cGroupsMemoryResourceHandler = new CGroupsV2MemoryResourceHandlerImpl(cGroupV2Handler); + } else { + cGroupsMemoryResourceHandler = new CGroupsMemoryResourceHandlerImpl(cGroupV1Handler); + } } } } @@ -326,15 +359,20 @@ private static void addHandlersFromConfiguredResourcePlugins( } for (ResourcePlugin plugin : pluginMap.values()) { + initializeCGroupHandlers(conf); addHandlerIfNotNull(handlerList, plugin.createResourceHandler(nmContext, - getInitializedCGroupsHandler(conf), + cGroupV1Handler, PrivilegedOperationExecutor.getInstance(conf))); } } public static ResourceHandlerChain getConfiguredResourceHandlerChain( Configuration conf, Context nmContext) throws ResourceHandlerException { + cgroupsV2Enabled = + conf.getBoolean(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_V2_ENABLED, + YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_V2_ENABLED); + if (resourceHandlerChain == null) { synchronized (ResourceHandlerModule.class) { if (resourceHandlerChain == null) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsV2HandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsV2HandlerImpl.java index b8d1fb238d1f7..1ec952be049ff 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsV2HandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsV2HandlerImpl.java @@ -210,6 +210,111 @@ public void testMtabParsing() throws Exception { Assert.assertEquals(parentDir.getAbsolutePath(), memoryDir); } + /* + * Create a mock mtab file with the following content for hybrid v1/v2: + * cgroup2 /path/to/parentV2Dir cgroup2 rw,nosuid,nodev,noexec,relatime,memory_recursiveprot 0 0 + * cgroup /path/to/parentDir/memory cgroup rw,nosuid,nodev,noexec,relatime,memory 0 0 + * + * Create the following cgroup hierarchy: + * + * parentDir + * __________________________________ + * / \ + * unified memory + * _________________________________________________ + * / \ \ + * cgroup.controllers cgroup.subtree_control test-hadoop-yarn (hierarchyDir) + * _________________ + * / \ + * cgroup.controllers cgroup.subtree_control + */ + public File createPremountedHybridCgroups(File v1ParentDir) + throws IOException { + File v2ParentDir = new File(v1ParentDir, "unified"); + + String mtabContent = + "cgroup " + v1ParentDir.getAbsolutePath() + "/memory" + + " cgroup rw,nosuid,nodev,noexec,relatime,memory 0 0\n" + + "cgroup2 " + v2ParentDir.getAbsolutePath() + + " cgroup2 rw,nosuid,nodev,noexec,relatime,memory_recursiveprot 0 0\n"; + + File mockMtab = createFileWithContent(v1ParentDir, UUID.randomUUID().toString(), mtabContent); + + String enabledV2Controllers = "cpuset cpu io hugetlb pids rdma misc\n"; + File controllersFile = createFileWithContent(v2ParentDir, + CGroupsHandler.CGROUP_CONTROLLERS_FILE, enabledV2Controllers); + + File subtreeControlFile = new File(v2ParentDir, CGroupsHandler.CGROUP_SUBTREE_CONTROL_FILE); + Assert.assertTrue("empty subtree_control file should be created", + subtreeControlFile.createNewFile()); + + File hierarchyDir = new File(v2ParentDir, 
hierarchy); + if (!hierarchyDir.mkdirs()) { + String message = "Could not create directory " + hierarchyDir.getAbsolutePath(); + throw new IOException(message); + } + hierarchyDir.deleteOnExit(); + + FileUtils.copyFile(controllersFile, new File(hierarchyDir, + CGroupsHandler.CGROUP_CONTROLLERS_FILE)); + FileUtils.copyFile(subtreeControlFile, new File(hierarchyDir, + CGroupsHandler.CGROUP_SUBTREE_CONTROL_FILE)); + + return mockMtab; + } + + @Test + public void testHybridMtabParsing() throws Exception { + // Initialize mtab and cgroup dir + File v1ParentDir = new File(tmpPath); + + File v2ParentDir = new File(v1ParentDir, "unified"); + Assert.assertTrue("temp dir should be created", v2ParentDir.mkdirs()); + v2ParentDir.deleteOnExit(); + + // create mock cgroup + File mockMtabFile = createPremountedHybridCgroups(v1ParentDir); + + // create memory cgroup for v1 + File memoryCgroup = new File(v1ParentDir, "memory"); + assertTrue("Directory should be created", memoryCgroup.mkdirs()); + + // init v1 and v2 handlers + CGroupsHandlerImpl cGroupsHandler = new CGroupsHandlerImpl( + createMountConfiguration(), + privilegedOperationExecutorMock, mockMtabFile.getAbsolutePath()); + CGroupsV2HandlerImpl cGroupsV2Handler = new CGroupsV2HandlerImpl( + createMountConfiguration(), + privilegedOperationExecutorMock, mockMtabFile.getAbsolutePath()); + + // Verify resource handlers that are enabled in v1 + Map> newMtab = + cGroupsHandler.parseMtab(mockMtabFile.getAbsolutePath()); + Map controllerv1Paths = + cGroupsHandler.initializeControllerPathsFromMtab( + newMtab); + + Assert.assertEquals(1, controllerv1Paths.size()); + assertTrue(controllerv1Paths + .containsKey(CGroupsHandler.CGroupController.MEMORY)); + String memoryDir = + controllerv1Paths.get(CGroupsHandler.CGroupController.MEMORY); + Assert.assertEquals(memoryCgroup.getAbsolutePath(), memoryDir); + + // Verify resource handlers that are enabled in v2 + newMtab = + cGroupsV2Handler.parseMtab(mockMtabFile.getAbsolutePath()); + Map controllerPaths = + cGroupsV2Handler.initializeControllerPathsFromMtab( + newMtab); + + Assert.assertEquals(3, controllerPaths.size()); + assertTrue(controllerPaths + .containsKey(CGroupsHandler.CGroupController.CPU)); + String cpuDir = controllerPaths.get(CGroupsHandler.CGroupController.CPU); + Assert.assertEquals(v2ParentDir.getAbsolutePath(), cpuDir); + } + @Test public void testManualCgroupSetting() throws Exception { YarnConfiguration conf = new YarnConfiguration(); @@ -217,11 +322,32 @@ public void testManualCgroupSetting() throws Exception { conf.set(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_HIERARCHY, "/hadoop-yarn"); - File subCgroup = new File(tmpPath, "/hadoop-yarn"); + validateCgroupV2Controllers(conf, tmpPath); + } + + @Test + public void testManualHybridCgroupSetting() throws Exception { + String unifiedPath = tmpPath + "/unified"; + + YarnConfiguration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH, tmpPath); + conf.set(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_V2_MOUNT_PATH, unifiedPath); + conf.set(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_HIERARCHY, + "/hadoop-yarn"); + + validateCgroupV1Controllers(conf, tmpPath); + validateCgroupV2Controllers(conf, unifiedPath); + } + + private void validateCgroupV2Controllers(YarnConfiguration conf, String mountPath) + throws Exception { + File baseCgroup = new File(mountPath); + File subCgroup = new File(mountPath, "/hadoop-yarn"); Assert.assertTrue("temp dir should be created", subCgroup.mkdirs()); 
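+    // cgroup v2 setup: each directory advertises its available controllers in
+    // its cgroup.controllers file, and the handler is expected to delegate them
+    // to child cgroups by writing "+<controller>" entries into
+    // cgroup.subtree_control, which the assertions below verify.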
subCgroup.deleteOnExit(); String enabledControllers = "cpuset cpu io memory hugetlb pids rdma misc\n"; + createFileWithContent(baseCgroup, CGroupsHandler.CGROUP_CONTROLLERS_FILE, enabledControllers); createFileWithContent(subCgroup, CGroupsHandler.CGROUP_CONTROLLERS_FILE, enabledControllers); File subtreeControlFile = new File(subCgroup.getAbsolutePath(), @@ -233,8 +359,8 @@ public void testManualCgroupSetting() throws Exception { cGroupsHandler.initializeCGroupController(CGroupsHandler.CGroupController.CPU); Assert.assertEquals("CPU cgroup path was not set", subCgroup.getAbsolutePath(), - new File(cGroupsHandler.getPathForCGroup( - CGroupsHandler.CGroupController.CPU, "")).getAbsolutePath()); + new File(cGroupsHandler.getPathForCGroup( + CGroupsHandler.CGroupController.CPU, "")).getAbsolutePath()); // Verify that the subtree control file was updated String subtreeControllersEnabledString = FileUtils.readFileToString(subtreeControlFile, @@ -274,4 +400,21 @@ public void testManualCgroupSetting() throws Exception { Assert.assertTrue("Controllers not enabled in subtree control file", cGroupsHandler.getValidCGroups().containsAll(subtreeControllersEnabled)); } + + private void validateCgroupV1Controllers(YarnConfiguration conf, String mountPath) + throws ResourceHandlerException { + File blkio = new File(new File(mountPath, "blkio"), "/hadoop-yarn"); + + Assert.assertTrue("temp dir should be created", blkio.mkdirs()); + + CGroupsHandlerImpl cGroupsv1Handler = new CGroupsHandlerImpl(conf, null); + cGroupsv1Handler.initializeCGroupController( + CGroupsHandler.CGroupController.BLKIO); + + Assert.assertEquals("BLKIO CGRoup path was not set", blkio.getAbsolutePath(), + new File(cGroupsv1Handler.getPathForCGroup( + CGroupsHandler.CGroupController.BLKIO, "")).getAbsolutePath()); + + FileUtils.deleteQuietly(blkio); + } } \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCgroupsV2MemoryResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCgroupsV2MemoryResourceHandlerImpl.java new file mode 100644 index 0000000000000..064bc1a0ee532 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCgroupsV2MemoryResourceHandlerImpl.java @@ -0,0 +1,209 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ExecutionType; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.util.List; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class TestCgroupsV2MemoryResourceHandlerImpl { + + private CGroupsHandler mockCGroupsHandler; + private CGroupsV2MemoryResourceHandlerImpl cGroupsMemoryResourceHandler; + + @Before + public void setup() { + mockCGroupsHandler = mock(CGroupsHandler.class); + when(mockCGroupsHandler.getPathForCGroup(any(), any())).thenReturn("."); + cGroupsMemoryResourceHandler = + new CGroupsV2MemoryResourceHandlerImpl(mockCGroupsHandler); + } + + @Test + public void testBootstrap() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false); + conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false); + List ret = + cGroupsMemoryResourceHandler.bootstrap(conf); + verify(mockCGroupsHandler, times(1)) + .initializeCGroupController(CGroupsHandler.CGroupController.MEMORY); + Assert.assertNull(ret); + conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, true); + try { + cGroupsMemoryResourceHandler.bootstrap(conf); + } catch (ResourceHandlerException re) { + Assert.fail("Pmem check should be allowed to run with cgroups"); + } + conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false); + conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, true); + try { + cGroupsMemoryResourceHandler.bootstrap(conf); + } catch (ResourceHandlerException re) { + Assert.fail("Vmem check should be allowed to run with cgroups"); + } + } + + @Test + public void testPreStart() throws Exception { + Configuration conf = new Configuration(); + conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false); + conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false); + cGroupsMemoryResourceHandler.bootstrap(conf); + String id = "container_01_01"; + String path = "test-path/" + id; + ContainerId mockContainerId = mock(ContainerId.class); + when(mockContainerId.toString()).thenReturn(id); + Container mockContainer = mock(Container.class); + when(mockContainer.getContainerId()).thenReturn(mockContainerId); + when(mockCGroupsHandler + .getPathForCGroupTasks(CGroupsHandler.CGroupController.MEMORY, id)) + .thenReturn(path); + int memory = 1024; + when(mockContainer.getResource()) + .thenReturn(Resource.newInstance(memory, 1)); + List ret = + cGroupsMemoryResourceHandler.preStart(mockContainer); + verify(mockCGroupsHandler, times(1)) + .createCGroup(CGroupsHandler.CGroupController.MEMORY, id); + verify(mockCGroupsHandler, times(1)) + .updateCGroupParam(CGroupsHandler.CGroupController.MEMORY, id, + CGroupsHandler.CGROUP_MEMORY_MAX, + String.valueOf(memory) + "M"); + verify(mockCGroupsHandler, times(1)) + 
.updateCGroupParam(CGroupsHandler.CGroupController.MEMORY, id, + CGroupsHandler.CGROUP_MEMORY_LOW, + String.valueOf((int) (memory * 0.9)) + "M"); + Assert.assertNotNull(ret); + Assert.assertEquals(1, ret.size()); + PrivilegedOperation op = ret.get(0); + Assert.assertEquals(PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP, + op.getOperationType()); + List args = op.getArguments(); + Assert.assertEquals(1, args.size()); + Assert.assertEquals(PrivilegedOperation.CGROUP_ARG_PREFIX + path, + args.get(0)); + } + + @Test + public void testPreStartNonEnforced() throws Exception { + Configuration conf = new Configuration(); + conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false); + conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false); + conf.setBoolean(YarnConfiguration.NM_MEMORY_RESOURCE_ENFORCED, false); + cGroupsMemoryResourceHandler.bootstrap(conf); + String id = "container_01_01"; + String path = "test-path/" + id; + ContainerId mockContainerId = mock(ContainerId.class); + when(mockContainerId.toString()).thenReturn(id); + Container mockContainer = mock(Container.class); + when(mockContainer.getContainerId()).thenReturn(mockContainerId); + when(mockCGroupsHandler + .getPathForCGroupTasks(CGroupsHandler.CGroupController.MEMORY, id)) + .thenReturn(path); + int memory = 1024; + when(mockContainer.getResource()) + .thenReturn(Resource.newInstance(memory, 1)); + List ret = + cGroupsMemoryResourceHandler.preStart(mockContainer); + verify(mockCGroupsHandler, times(1)) + .createCGroup(CGroupsHandler.CGroupController.MEMORY, id); + verify(mockCGroupsHandler, times(0)) + .updateCGroupParam(CGroupsHandler.CGroupController.MEMORY, id, + CGroupsHandler.CGROUP_MEMORY_MAX, + String.valueOf(memory) + "M"); + verify(mockCGroupsHandler, times(0)) + .updateCGroupParam(CGroupsHandler.CGroupController.MEMORY, id, + CGroupsHandler.CGROUP_MEMORY_LOW, + String.valueOf((int) (memory * 0.9)) + "M"); + Assert.assertNotNull(ret); + Assert.assertEquals(1, ret.size()); + PrivilegedOperation op = ret.get(0); + Assert.assertEquals(PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP, + op.getOperationType()); + List args = op.getArguments(); + Assert.assertEquals(1, args.size()); + Assert.assertEquals(PrivilegedOperation.CGROUP_ARG_PREFIX + path, + args.get(0)); + } + + @Test + public void testReacquireContainer() throws Exception { + ContainerId containerIdMock = mock(ContainerId.class); + Assert.assertNull( + cGroupsMemoryResourceHandler.reacquireContainer(containerIdMock)); + } + + @Test + public void testPostComplete() throws Exception { + String id = "container_01_01"; + ContainerId mockContainerId = mock(ContainerId.class); + when(mockContainerId.toString()).thenReturn(id); + Assert + .assertNull(cGroupsMemoryResourceHandler.postComplete(mockContainerId)); + verify(mockCGroupsHandler, times(1)) + .deleteCGroup(CGroupsHandler.CGroupController.MEMORY, id); + } + + @Test + public void testTeardown() throws Exception { + Assert.assertNull(cGroupsMemoryResourceHandler.teardown()); + } + + @Test + public void testOpportunistic() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false); + conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false); + + cGroupsMemoryResourceHandler.bootstrap(conf); + ContainerTokenIdentifier tokenId = mock(ContainerTokenIdentifier.class); + when(tokenId.getExecutionType()).thenReturn(ExecutionType.OPPORTUNISTIC); + Container container = mock(Container.class); + String id = "container_01_01"; 
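+    // cgroup v2 has no per-cgroup swappiness knob, so for an OPPORTUNISTIC
+    // container the v2 handler only drops memory.low to 0M while memory.max
+    // is still set to the full 1024M allocation, as verified below.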
+    ContainerId mockContainerId = mock(ContainerId.class);
+    when(mockContainerId.toString()).thenReturn(id);
+    when(container.getContainerId()).thenReturn(mockContainerId);
+    when(container.getContainerTokenIdentifier()).thenReturn(tokenId);
+    when(container.getResource()).thenReturn(Resource.newInstance(1024, 2));
+    cGroupsMemoryResourceHandler.preStart(container);
+    verify(mockCGroupsHandler, times(1))
+        .updateCGroupParam(CGroupsHandler.CGroupController.MEMORY, id,
+            CGroupsHandler.CGROUP_MEMORY_LOW, "0M");
+    verify(mockCGroupsHandler, times(1))
+        .updateCGroupParam(CGroupsHandler.CGroupController.MEMORY, id,
+            CGroupsHandler.CGROUP_MEMORY_MAX, "1024M");
+  }
+}
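Note on the refactor above: both CGroupsMemoryResourceHandlerImpl and CGroupsV2MemoryResourceHandlerImpl now extend AbstractCGroupsMemoryResourceHandler, which is introduced by this patch but not included in this excerpt. The sketch below shows the template-method shape implied by the subclass overrides and by the expectations in the tests; names and configuration keys are taken from the code visible above, but the actual base class may differ in detail, and the preStart/postComplete/reacquireContainer/teardown lifecycle methods are omitted for brevity.

// Editorial sketch only: the AbstractCGroupsMemoryResourceHandler added by this
// patch is not part of this excerpt. The shape below is inferred from the
// subclass overrides and the tests above and may differ from the real class;
// logging and the preStart/postComplete/reacquireContainer/teardown methods are omitted.
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;

import java.io.File;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ExecutionType;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;

public abstract class AbstractCGroupsMemoryResourceHandler implements MemoryResourceHandler {

  protected static final CGroupsHandler.CGroupController MEMORY =
      CGroupsHandler.CGroupController.MEMORY;
  protected static final int OPPORTUNISTIC_SOFT_LIMIT = 0;

  private final CGroupsHandler cGroupsHandler;
  private boolean enforce = true;
  // multiplier for the soft limit - value is between 0 and 1
  private float softLimit = 0.0f;

  AbstractCGroupsMemoryResourceHandler(CGroupsHandler cGroupsHandler) {
    this.cGroupsHandler = cGroupsHandler;
  }

  protected CGroupsHandler getCGroupsHandler() {
    return cGroupsHandler;
  }

  @Override
  public List<PrivilegedOperation> bootstrap(Configuration conf)
      throws ResourceHandlerException {
    cGroupsHandler.initializeCGroupController(MEMORY);
    enforce = conf.getBoolean(YarnConfiguration.NM_MEMORY_RESOURCE_ENFORCED,
        YarnConfiguration.DEFAULT_NM_MEMORY_RESOURCE_ENFORCED);
    float softLimitPerc = conf.getFloat(
        YarnConfiguration.NM_MEMORY_RESOURCE_CGROUPS_SOFT_LIMIT_PERCENTAGE,
        YarnConfiguration.DEFAULT_NM_MEMORY_RESOURCE_CGROUPS_SOFT_LIMIT_PERCENTAGE);
    if (softLimitPerc < 0.0f || softLimitPerc > 100.0f) {
      throw new ResourceHandlerException("Illegal value '" + softLimitPerc + "' "
          + YarnConfiguration.NM_MEMORY_RESOURCE_CGROUPS_SOFT_LIMIT_PERCENTAGE
          + ". Value must be between 0 and 100.");
    }
    softLimit = softLimitPerc / 100.0f;
    return null;
  }

  @Override
  public List<PrivilegedOperation> updateContainer(Container container)
      throws ResourceHandlerException {
    String cgroupId = container.getContainerId().toString();
    File cgroup = new File(cGroupsHandler.getPathForCGroup(MEMORY, cgroupId));
    if (enforce && cgroup.exists()) {
      try {
        // memory is in MB; the hard limit is always the full allocation
        long hardLimit = container.getResource().getMemorySize();
        updateMemoryHardLimit(cgroupId, hardLimit);
        ContainerTokenIdentifier id = container.getContainerTokenIdentifier();
        if (id != null && id.getExecutionType() == ExecutionType.OPPORTUNISTIC) {
          updateOpportunisticMemoryLimits(cgroupId);
        } else {
          updateGuaranteedMemoryLimits(cgroupId, (long) (hardLimit * softLimit));
        }
      } catch (ResourceHandlerException re) {
        cGroupsHandler.deleteCGroup(MEMORY, cgroupId);
        throw re;
      }
    }
    return null;
  }

  // the version-specific cgroup parameters are supplied by the v1/v2 subclasses
  protected abstract void updateMemoryHardLimit(String cgroupId, long containerHardLimit)
      throws ResourceHandlerException;

  protected abstract void updateOpportunisticMemoryLimits(String cgroupId)
      throws ResourceHandlerException;

  protected abstract void updateGuaranteedMemoryLimits(String cgroupId, long containerSoftLimit)
      throws ResourceHandlerException;
}

Keeping the limit arithmetic and the opportunistic/guaranteed branching in one place means the two handlers differ only in which cgroup parameters they write: CGROUP_PARAM_MEMORY_HARD_LIMIT_BYTES, CGROUP_PARAM_MEMORY_SOFT_LIMIT_BYTES and CGROUP_PARAM_MEMORY_SWAPPINESS for v1 versus CGROUP_MEMORY_MAX and CGROUP_MEMORY_LOW for v2.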