HADOOP-19221. Consider AWSStatus500Exception recoverable
* AWSStatus500Exception is now a recoverable exception
* section in troubleshooting on it
* and one on 503

+ checkstyle, test tweaking

Change-Id: Idda8c7eaceeeb1034dfc08ef3bf875be571f468d
steveloughran committed Jul 23, 2024
1 parent 2aa1ef6 commit 2790d56
Showing 8 changed files with 149 additions and 63 deletions.
@@ -36,7 +36,7 @@
public final class ByteBufferInputStream extends InputStream {
private static final Logger LOG =
LoggerFactory.getLogger(DataBlocks.class);

/** Size of the buffer. */
private final int size;

@@ -22,21 +22,20 @@

/**
* A 5xx response came back from a service.
* <p>
* The 500 error is considered retryable by the AWS SDK, which will have already
* tried it {@code fs.s3a.attempts.maximum} times before reaching s3a
* code.
* <p>
* These are rare, but can occur; they are considered retryable.
* Note that HADOOP-19221 shows a failure condition where the
* SDK itself did not recover on retry from the error.
* Mitigation for the specific failure sequence is now in place.
*/
public class AWSStatus500Exception extends AWSServiceIOException {
public AWSStatus500Exception(String operation,
AwsServiceException cause) {
super(operation, cause);
}
}
@@ -228,15 +228,15 @@ protected Map<Class<? extends Exception>, RetryPolicy> createExceptionMap() {
// throttled requests can be retried, always
policyMap.put(AWSServiceThrottledException.class, throttlePolicy);

// Status 5xx error code has historically been treated as an immediate failure
// this is a sign of a server-side problem, and while
// rare in AWS S3, it does happen on third party stores.
// (out of disk space, etc).
// by the time we get here, the aws sdk will have
// already retried, if it is configured to retry exceptions.
// there is specific handling for some 5XX codes (501, 503);
// this is for everything else
policyMap.put(AWSStatus500Exception.class, retryAwsClientExceptions);

// subclass of AWSServiceIOException whose cause is always S3Exception
policyMap.put(AWSS3IOException.class, retryIdempotentCalls);
@@ -575,13 +575,18 @@ public SinglePendingCommit uploadFileToPendingCommit(File localFile,
numParts, length));
}

LOG.debug("File size is {}, number of parts to upload = {}",
length, numParts);

// Open the file to upload.
List<CompletedPart> parts = uploadFileData(
uploadId,
localFile,
destKey,
progress,
length,
numParts,
uploadPartSize);

commitData.bindCommitData(parts);
statistics.commitUploaded(length);
@@ -608,15 +613,16 @@ public SinglePendingCommit uploadFileToPendingCommit(File localFile,
* Upload file data using content provider API.
* This is a rewrite of the previous code to address HADOOP-19221;
* our own {@link UploadContentProviders} file content provider
* is used to upload each part of the file.
* @param uploadId upload ID
* @param localFile locally staged file
* @param destKey destination path
* @param progress progress callback
* @param length file length
* @param numParts number of parts to upload
* @param uploadPartSize max size of a part
* @return the ordered list of parts
* @throws IOException IO failure
*/
private List<CompletedPart> uploadFileData(
final String uploadId,
@@ -22,7 +22,6 @@

import org.apache.hadoop.fs.s3a.statistics.CountersAndGauges;
import org.apache.hadoop.fs.s3a.statistics.StatisticsFromAwsSdk;
import org.apache.hadoop.fs.statistics.StoreStatisticNames;

import static org.apache.hadoop.fs.s3a.Statistic.STORE_IO_REQUEST;
import static org.apache.hadoop.fs.s3a.Statistic.STORE_IO_RETRY;
@@ -29,7 +29,7 @@ Common problems working with S3 are:
7. [Other Errors](#other)
8. [SDK Upgrade Warnings](#upgrade_warnings)

This document also includes some [best practices](#best) to aid troubleshooting.


Troubleshooting IAM Assumed Roles is covered in its
@@ -236,8 +236,60 @@ read requests are allowed, but operations which write to the bucket are denied.

Check the system clock.


### `Class does not implement software.amazon.awssdk.auth.credentials.AwsCredentialsProvider`

A credential provider listed in `fs.s3a.aws.credentials.provider` does not implement
the interface `software.amazon.awssdk.auth.credentials.AwsCredentialsProvider`.

```
InstantiationIOException: `s3a://stevel-gcs/': Class org.apache.hadoop.fs.s3a.S3ARetryPolicy does not implement
software.amazon.awssdk.auth.credentials.AwsCredentialsProvider (configuration key fs.s3a.aws.credentials.provider)
at org.apache.hadoop.fs.s3a.impl.InstantiationIOException.isNotInstanceOf(InstantiationIOException.java:128)
at org.apache.hadoop.fs.s3a.S3AUtils.getInstanceFromReflection(S3AUtils.java:604)
at org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.createAWSV2CredentialProvider(CredentialProviderListFactory.java:299)
at org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.buildAWSProviderList(CredentialProviderListFactory.java:245)
at org.apache.hadoop.fs.s3a.auth.CredentialProviderListFactory.createAWSCredentialProviderList(CredentialProviderListFactory.java:144)
at org.apache.hadoop.fs.s3a.S3AFileSystem.bindAWSClient(S3AFileSystem.java:971)
at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:624)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3601)
at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:171)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3702)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3653)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:555)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:366)
```

There are two main causes:

1. A class listed there is not an implementation of the interface.
Fix: review the settings and correct as appropriate.
1. A class listed there does implement the interface, but it has been loaded in a different
classloader, so the JVM does not consider it to be an implementation.
Fix: learn the entire JVM classloader model and see if you can then debug it.
Tip: having both the AWS Shaded SDK and individual AWS SDK modules on your classpath
may be a cause of this.

If you see this and you are trying to use the S3A connector with Spark, then the cause can
be that the isolated classloader used to load Hive classes is interfering with the S3A
connector's dynamic loading of `software.amazon.awssdk` classes. To fix this, declare that
the classes in the aws SDK are loaded from the same classloader which instantiated
the S3A FileSystem instance:

```
spark.sql.hive.metastore.sharedPrefixes software.amazon.awssdk.
```
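
For contrast, a valid configuration lists only classes which do implement that interface.
A sketch using the standard providers which ship with the S3A connector; keep only the
entries matching how you actually authenticate:

```
<property>
  <name>fs.s3a.aws.credentials.provider</name>
  <value>
    org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider,
    org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,
    org.apache.hadoop.fs.s3a.auth.IAMInstanceCredentialsProvider
  </value>
</property>
```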


## <a name="400_bad_request"></a> 400 Bad Request errors

S3 stores return HTTP status code 400 "Bad Request" when the client makes a request which
the store considers invalid.

This is most commonly caused by signing errors.

### <a name="bad_request"></a> "Bad Request" exception when working with data stores in an AWS region other than us-east


```
@@ -286,47 +338,37 @@ S3 region as `ca-central-1`.
</property>
```


### <a name="request_timeout"></a> 400 + RequestTimeout "Your socket connection to the server was not read from or written to within the timeout period"

```
org.apache.hadoop.fs.s3a.AWSBadRequestException: upload part #1 upload ID KylDNXbmiZlZE5JI2aKVcVeA66ly:
software.amazon.awssdk.services.s3.model.S3Exception:
Your socket connection to the server was not read from or written to within the timeout period.
Idle connections will be closed.
(Service: S3, Status Code: 400, Request ID: TT17CRYF6HJH2G0Y, Extended Request ID: ...):
RequestTimeout:
Your socket connection to the server was not read from or written to within the timeout period.
Idle connections will be closed. (Service: S3, Status Code: 400, Request ID: TT17CRYF6HJH2G0Y, Extended Request ID: ...

```

This is an obscure failure which was encountered as part of
[HADOOP-19221](https://issues.apache.org/jira/browse/HADOOP-19221): an upload of part of a file could not
be successfully retried after a failure was reported on the first attempt.

1. It was only encountered when uploading files via the Staging Committers.
2. It is a regression in the V2 AWS SDK.
3. This should have been addressed in the S3A connector.

* If it is encountered on a Hadoop release containing the HADOOP-19221 fix, then this is a regression: please report it.
* If it is encountered on a release without the fix, please upgrade.

It may be that the problem arises in the AWS SDK's "TransferManager", which is used for a
higher performance upload of data from the local filesystem. If this is the case, disable this feature:
```
<property>
<name>fs.s3a.optimized.copy.from.local.enabled</name>
<value>false</value>
</property>
```

## <a name="access_denied"></a> "The security token included in the request is invalid"
@@ -501,7 +543,43 @@ endpoint and region like the following:
<value>${sts.region}</value>
</property>
```
## <a name="500_internal_error"></a> HTTP 500 status code "We encountered an internal error"

```
We encountered an internal error. Please try again.
(Service: S3, Status Code: 500, Request ID: <id>, Extended Request ID: <extended-id>)
```

The [status code 500](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500) indicates
the S3 store has reported an internal problem.
When raised by Amazon S3, we believe this is a rare sign of a problem within the S3 system
or another part of the cloud infrastructure on which it depends.
Retrying _should_ make it go away.

The 500 error is considered retryable by the AWS SDK, which will have already
tried it `fs.s3a.attempts.maximum` times before reaching the S3A client, which
will also retry.
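
If the retry behaviour needs tuning, both layers expose configuration options.
A sketch with illustrative values only; check the defaults of the release in use
before changing them:

```
<property>
  <!-- retries attempted inside the AWS SDK -->
  <name>fs.s3a.attempts.maximum</name>
  <value>5</value>
</property>
<property>
  <!-- retries performed by the S3A client once the SDK has given up -->
  <name>fs.s3a.retry.limit</name>
  <value>7</value>
</property>
<property>
  <!-- initial sleep between S3A retry attempts -->
  <name>fs.s3a.retry.interval</name>
  <value>500ms</value>
</property>
```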

If encountered against a third party store (the lack of an extended request ID always implies this),
then it may be a permanent server-side failure. Fix that.

* All 5xx status codes other than 503 (service unavailable) and 501 (unsupported) are
treated as 500 exceptions.
* The S3A Filesystem IOStatistics count the number of 500 errors received.

## <a name="503 Throttling"></a> HTTP 503 status code "slow down" or 429 "Too Many Requests"

The [status code 503](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/503)
is returned by AWS S3 when the IO rate limit of the bucket is reached.

Google's cloud storage returns the response [429 Too Many Requests](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429)
for the same situation.

The AWS S3 documentation [covers this and suggests mitigation strategies](https://repost.aws/knowledge-center/http-5xx-errors-s3).
Note that it can also be caused by throttling in the KMS encryption subsystem if
SSE-KMS or DSSE-KMS is used to encrypt data.
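
If throttling is persistent, the S3A client's throttle retry policy can be tuned;
the values below are illustrative only, not recommendations:

```
<property>
  <!-- how many times to retry a throttled request -->
  <name>fs.s3a.retry.throttle.limit</name>
  <value>20</value>
</property>
<property>
  <!-- initial delay between throttle retries; it grows with each attempt -->
  <name>fs.s3a.retry.throttle.interval</name>
  <value>500ms</value>
</property>
```

Reducing the number of clients, threads and queued operations working with the bucket
also lowers the request rate.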

Consult [performance - throttling](./performance.html#throttling) for details on throttling.

## <a name="connectivity"></a> Connectivity Problems

@@ -138,7 +138,7 @@ public void testBlockFactoryIO() throws Throwable {
int expected = bufferLen;
assertAvailableValue(stream, expected);

assertReadEquals(stream, 't');

stream.mark(Integer.MAX_VALUE);
expected--;
@@ -51,6 +51,7 @@
import org.apache.hadoop.fs.s3a.commit.impl.CommitOperations;
import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest;

import static org.apache.hadoop.fs.contract.ContractTestUtils.verifyFileContents;
import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_MULTIPART_SIZE;
import static org.apache.hadoop.fs.s3a.Constants.DIRECTORY_OPERATIONS_PURGE_UPLOADS;
import static org.apache.hadoop.fs.s3a.Constants.FAST_UPLOAD_BUFFER;
@@ -78,9 +79,9 @@
* <p>
* Fault injection is implemented in {@link FaultInjector}; this uses the evaluator
* function {@link #evaluator} to determine if the request type is that for which
* failures are targeted; for when there is a match then {@link #REQUEST_FAILURE_COUNT}
* is decremented and, if the count is still positive, an error is raised with the
* error code defined in {@link #FAILURE_STATUS_CODE}.
* This happens <i>after</i> the request has already succeeded against the S3 store.
*/
@RunWith(Parameterized.class)
@@ -117,14 +118,15 @@ public static Collection<Object[]> params() {
/**
* How many requests with the matching evaluator to fail on.
*/
public static final AtomicInteger REQUEST_FAILURE_COUNT = new AtomicInteger(1);

/**
* Evaluator for responses.
*/
private static Function<Context.ModifyHttpResponse, Boolean> evaluator;

private static final AtomicInteger FAILURE_STATUS_CODE =
new AtomicInteger(SC_500_INTERNAL_SERVER_ERROR);
/**
* should the commit test be included?
*/
@@ -153,12 +155,12 @@ private static void resetEvaluator() {
}

/**
* Set the failure count.
* @param count failure count
*/
private static void setRequestFailureCount(int count) {
LOG.debug("Failure count set to {}", count);
REQUEST_FAILURE_COUNT.set(count);
}

/**
@@ -279,6 +281,8 @@ public void testCommitOperations() throws Throwable {
= actions.createCommitContextForTesting(dest, JOB_ID, 0)) {
commitContext.commitOrFail(commit);
}
// make sure the saved data is as expected
verifyFileContents(fs, dest, dataset);
}

/**
@@ -337,7 +341,7 @@ public SdkHttpResponse modifyHttpResponse(final Context.ModifyHttpResponse conte
LOG.info("reporting 500 error code for request {}", request);

return httpResponse.copy(b -> {
b.statusCode(FAILURE_STATUS_CODE.get());
});

} else {
@@ -351,6 +355,6 @@ public SdkHttpResponse modifyHttpResponse(final Context.ModifyHttpResponse conte
* @return true if the request count means a request must fail
*/
private static boolean shouldFail() {
return REQUEST_FAILURE_COUNT.decrementAndGet() > 0;
}
}
