Skip to content

Commit e110e2a

Browse files
author
Bhavay Pahuja
committed
HADOOP-14837: Glacier read restored objects support
1 parent 4f0f5a5 commit e110e2a

File tree

10 files changed

+358
-4
lines changed

10 files changed

+358
-4
lines changed

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1519,6 +1519,12 @@ private Constants() {
15191519
*/
15201520
public static final int DEFAULT_PREFETCH_MAX_BLOCKS_COUNT = 4;
15211521

1522+
/**
1523+
* Read Restored Glacier objects config.
1524+
* Value = {@value}
1525+
*/
1526+
public static final String READ_RESTORED_GLACIER_OBJECTS = "fs.s3a.glacier.read.restored.objects";
1527+
15221528
/**
15231529
* The bucket region header.
15241530
*/

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Listing.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ public class Listing extends AbstractStoreOperation {
7676

7777
private static final Logger LOG = S3AFileSystem.LOG;
7878
private final boolean isCSEEnabled;
79+
private final S3ObjectStorageClassFilter s3ObjectStorageClassFilter;
7980

8081
static final FileStatusAcceptor ACCEPT_ALL_BUT_S3N =
8182
new AcceptAllButS3nDirs();
@@ -87,6 +88,7 @@ public Listing(ListingOperationCallbacks listingOperationCallbacks,
8788
super(storeContext);
8889
this.listingOperationCallbacks = listingOperationCallbacks;
8990
this.isCSEEnabled = storeContext.isCSEEnabled();
91+
this.s3ObjectStorageClassFilter = storeContext.getS3ObjectsStorageClassFilter();
9092
}
9193

9294
/**
@@ -462,7 +464,8 @@ private boolean buildNextStatusBatch(S3ListResult objects) {
462464
LOG.debug("{}: {}", keyPath, stringify(s3Object));
463465
}
464466
// Skip over keys that are ourselves and old S3N _$folder$ files
465-
if (acceptor.accept(keyPath, s3Object) && filter.accept(keyPath)) {
467+
// Handle objects in Glacier storage classes according to the fs.s3a.glacier.read.restored.objects setting
468+
if ( s3ObjectStorageClassFilter.getFilter().apply(s3Object) && acceptor.accept(keyPath, s3Object) && filter.accept(keyPath)) {
466469
S3AFileStatus status = createFileStatus(keyPath, s3Object,
467470
listingOperationCallbacks.getDefaultBlockSize(keyPath),
468471
getStoreContext().getUsername(),

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,12 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
441441
*/
442442
private boolean isCSEEnabled;
443443

444+
/**
445+
* {@link S3ObjectStorageClassFilter} will filter the S3 files based on the
446+
* {@code fs.s3a.glacier.read.restored.objects} configuration.
447+
*/
448+
private S3ObjectStorageClassFilter s3ObjectStorageClassFilter;
449+
444450
/**
445451
* Bucket AccessPoint.
446452
*/
@@ -577,6 +583,11 @@ public void initialize(URI name, Configuration originalConf)
577583

578584
s3aInternals = createS3AInternals();
579585

586+
s3ObjectStorageClassFilter = Optional.of(conf.getTrimmed(READ_RESTORED_GLACIER_OBJECTS,
587+
S3ObjectStorageClassFilter.READ_ALL.toString()))
588+
.map(String::toUpperCase)
589+
.map(S3ObjectStorageClassFilter::valueOf).get();
590+
580591
// look for encryption data
581592
// DT Bindings may override this
582593
setEncryptionSecrets(
@@ -5658,6 +5669,7 @@ public StoreContext createStoreContext() {
56585669
.setContextAccessors(new ContextAccessorsImpl())
56595670
.setAuditor(getAuditor())
56605671
.setEnableCSE(isCSEEnabled)
5672+
.setS3ObjectStorageClassFilter(s3ObjectStorageClassFilter)
56615673
.build();
56625674
}
56635675

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.fs.s3a;
20+
21+
import org.apache.hadoop.thirdparty.com.google.common.collect.Sets;
22+
import java.util.Set;
23+
import java.util.function.Function;
24+
import software.amazon.awssdk.services.s3.model.ObjectStorageClass;
25+
import software.amazon.awssdk.services.s3.model.S3Object;
26+
27+
28+
/**
29+
* <pre>
30+
* {@link S3ObjectStorageClassFilter} will filter the S3 files based on the {@code fs.s3a.glacier.read.restored.objects} configuration set in {@link S3AFileSystem}
31+
* The config can have 3 values:
32+
* {@code READ_ALL}: Retrieval of Glacier files will fail with InvalidObjectStateException: The operation is not valid for the object's storage class.
33+
* {@code SKIP_ALL_GLACIER}: If this value is set then this will ignore any S3 Objects which are tagged with Glacier storage classes and retrieve the others.
34+
* {@code READ_RESTORED_GLACIER_OBJECTS}: If this value is set then restored status of the Glacier object will be checked, if restored the objects would be read like normal S3 objects else they will be ignored as the objects would not have been retrieved from the S3 Glacier.
35+
* </pre>
36+
*/
37+
public enum S3ObjectStorageClassFilter {
38+
READ_ALL(o -> true),
39+
SKIP_ALL_GLACIER(S3ObjectStorageClassFilter::isNotGlacierObject),
40+
READ_RESTORED_GLACIER_OBJECTS(S3ObjectStorageClassFilter::isCompletedRestoredObject);
41+
42+
private static final Set<ObjectStorageClass> GLACIER_STORAGE_CLASSES = Sets.newHashSet(ObjectStorageClass.GLACIER, ObjectStorageClass.DEEP_ARCHIVE);
43+
44+
private final Function<S3Object, Boolean> filter;
45+
46+
S3ObjectStorageClassFilter(Function<S3Object, Boolean> filter) {
47+
this.filter = filter;
48+
}
49+
50+
/**
51+
* Checks if the s3 object is not an object with a storage class of glacier/deep_archive
52+
* @param object s3 object
53+
* @return if the s3 object is not an object with a storage class of glacier/deep_archive
54+
*/
55+
private static boolean isNotGlacierObject(S3Object object) {
56+
return !GLACIER_STORAGE_CLASSES.contains(object.storageClass());
57+
}
58+
59+
/**
60+
* Checks if the s3 object is an object with a storage class of glacier/deep_archive
61+
* @param object s3 object
62+
* @return if the s3 object is an object with a storage class of glacier/deep_archive
63+
*/
64+
private static boolean isGlacierObject(S3Object object) {
65+
return GLACIER_STORAGE_CLASSES.contains(object.storageClass());
66+
}
67+
68+
/**
69+
* Checks if the s3 object is completely restored
70+
* @param object s3 object
71+
* @return if the s3 object is completely restored
72+
*/
73+
private static boolean isCompletedRestoredObject(S3Object object) {
74+
if(isGlacierObject(object)) {
75+
return object.restoreStatus() != null && !object.restoreStatus().isRestoreInProgress();
76+
}
77+
return true;
78+
}
79+
80+
public Function<S3Object, Boolean> getFilter() {
81+
return filter;
82+
}
83+
84+
}

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/api/RequestFactory.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@
3737
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
3838
import software.amazon.awssdk.services.s3.model.ObjectIdentifier;
3939
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
40+
import software.amazon.awssdk.services.s3.model.RestoreObjectRequest;
4041
import software.amazon.awssdk.services.s3.model.StorageClass;
42+
import software.amazon.awssdk.services.s3.model.Tier;
4143
import software.amazon.awssdk.services.s3.model.UploadPartRequest;
4244

4345
import org.apache.hadoop.classification.InterfaceAudience;
@@ -251,4 +253,15 @@ ListObjectsV2Request.Builder newListObjectsV2RequestBuilder(String key,
251253
DeleteObjectsRequest.Builder newBulkDeleteRequestBuilder(
252254
List<ObjectIdentifier> keysToDelete);
253255

256+
/**
 * Create a request builder to initiate a restore of a Glacier object.
 * The request must still be built and executed by the caller.
 * @param key object to restore
 * @param tier glacier retrieval tier at which the restore will be processed
 * @param expirationDays lifetime of the active restored copy in days
 * @return the request builder
 */
RestoreObjectRequest.Builder newRestoreObjectRequestBuilder(String key,
    Tier tier,
    int expirationDays);
266+
254267
}

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/RequestFactoryImpl.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import software.amazon.awssdk.services.s3.model.DeleteObjectsRequest;
3535
import software.amazon.awssdk.services.s3.model.GetObjectRequest;
3636
import software.amazon.awssdk.services.s3.model.CreateMultipartUploadRequest;
37+
import software.amazon.awssdk.services.s3.model.GlacierJobParameters;
3738
import software.amazon.awssdk.services.s3.model.HeadBucketRequest;
3839
import software.amazon.awssdk.services.s3.model.HeadObjectRequest;
3940
import software.amazon.awssdk.services.s3.model.HeadObjectResponse;
@@ -42,9 +43,13 @@
4243
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
4344
import software.amazon.awssdk.services.s3.model.MetadataDirective;
4445
import software.amazon.awssdk.services.s3.model.ObjectIdentifier;
46+
import software.amazon.awssdk.services.s3.model.OptionalObjectAttributes;
4547
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
48+
import software.amazon.awssdk.services.s3.model.RestoreObjectRequest;
49+
import software.amazon.awssdk.services.s3.model.RestoreRequest;
4650
import software.amazon.awssdk.services.s3.model.ServerSideEncryption;
4751
import software.amazon.awssdk.services.s3.model.StorageClass;
52+
import software.amazon.awssdk.services.s3.model.Tier;
4853
import software.amazon.awssdk.services.s3.model.UploadPartRequest;
4954
import software.amazon.awssdk.utils.Md5Utils;
5055
import org.apache.hadoop.util.Preconditions;
@@ -609,6 +614,7 @@ public ListObjectsV2Request.Builder newListObjectsV2RequestBuilder(
609614
final ListObjectsV2Request.Builder requestBuilder = ListObjectsV2Request.builder()
610615
.bucket(bucket)
611616
.maxKeys(maxKeys)
617+
.optionalObjectAttributes(OptionalObjectAttributes.RESTORE_STATUS) // Optional Attribute to get the Restored Status of the Glacier Objects
612618
.prefix(key);
613619

614620
if (delimiter != null) {
@@ -632,6 +638,21 @@ public DeleteObjectsRequest.Builder newBulkDeleteRequestBuilder(
632638
.delete(d -> d.objects(keysToDelete).quiet(!LOG.isTraceEnabled())));
633639
}
634640

641+
@Override
642+
public RestoreObjectRequest.Builder newRestoreObjectRequestBuilder(String key,
643+
Tier tier,
644+
int expirationDays) {
645+
return prepareRequest(RestoreObjectRequest
646+
.builder()
647+
.bucket(bucket)
648+
.key(key)
649+
.restoreRequest(RestoreRequest
650+
.builder()
651+
.days(expirationDays)
652+
.glacierJobParameters(GlacierJobParameters.builder().tier(tier).build())
653+
.build()));
654+
}
655+
635656
@Override
636657
public void setEncryptionSecrets(final EncryptionSecrets secrets) {
637658
encryptionSecrets = secrets;

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/StoreContext.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import java.util.concurrent.ExecutorService;
2727

2828
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ListeningExecutorService;
29-
3029
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.MoreExecutors;
3130
import org.apache.hadoop.classification.InterfaceAudience;
3231
import org.apache.hadoop.classification.InterfaceStability;
@@ -38,6 +37,7 @@
3837
import org.apache.hadoop.fs.s3a.S3AFileStatus;
3938
import org.apache.hadoop.fs.s3a.S3AInputPolicy;
4039
import org.apache.hadoop.fs.s3a.S3AStorageStatistics;
40+
import org.apache.hadoop.fs.s3a.S3ObjectStorageClassFilter;
4141
import org.apache.hadoop.fs.s3a.Statistic;
4242
import org.apache.hadoop.fs.s3a.statistics.S3AStatisticsContext;
4343
import org.apache.hadoop.fs.store.audit.ActiveThreadSpanSource;
@@ -117,6 +117,8 @@ public class StoreContext implements ActiveThreadSpanSource<AuditSpan> {
117117
/** Is client side encryption enabled? */
118118
private final boolean isCSEEnabled;
119119

120+
private final S3ObjectStorageClassFilter s3ObjectStorageClassFilter;
121+
120122
/**
121123
* Instantiate.
122124
*/
@@ -137,7 +139,8 @@ public class StoreContext implements ActiveThreadSpanSource<AuditSpan> {
137139
final boolean useListV1,
138140
final ContextAccessors contextAccessors,
139141
final AuditSpanSource<AuditSpanS3A> auditor,
140-
final boolean isCSEEnabled) {
142+
final boolean isCSEEnabled,
143+
final S3ObjectStorageClassFilter s3ObjectStorageClassFilter) {
141144
this.fsURI = fsURI;
142145
this.bucket = bucket;
143146
this.configuration = configuration;
@@ -158,6 +161,7 @@ public class StoreContext implements ActiveThreadSpanSource<AuditSpan> {
158161
this.contextAccessors = contextAccessors;
159162
this.auditor = auditor;
160163
this.isCSEEnabled = isCSEEnabled;
164+
this.s3ObjectStorageClassFilter = s3ObjectStorageClassFilter;
161165
}
162166

163167
public URI getFsURI() {
@@ -411,4 +415,12 @@ public RequestFactory getRequestFactory() {
411415
public boolean isCSEEnabled() {
412416
return isCSEEnabled;
413417
}
418+
419+
/**
 * Return the S3ObjectStorageClassFilter object for S3A, whose value is set
 * according to the config {@code fs.s3a.glacier.read.restored.objects}.
 * @return {@link S3ObjectStorageClassFilter} object
 */
public S3ObjectStorageClassFilter getS3ObjectsStorageClassFilter() {
  return s3ObjectStorageClassFilter;
}
414426
}

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/StoreContextBuilder.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.apache.hadoop.fs.s3a.Invoker;
2626
import org.apache.hadoop.fs.s3a.S3AInputPolicy;
2727
import org.apache.hadoop.fs.s3a.S3AStorageStatistics;
28+
import org.apache.hadoop.fs.s3a.S3ObjectStorageClassFilter;
2829
import org.apache.hadoop.fs.s3a.audit.AuditSpanS3A;
2930
import org.apache.hadoop.fs.s3a.statistics.S3AStatisticsContext;
3031
import org.apache.hadoop.fs.store.audit.AuditSpanSource;
@@ -69,6 +70,8 @@ public class StoreContextBuilder {
6970

7071
private boolean isCSEEnabled;
7172

73+
private S3ObjectStorageClassFilter s3ObjectStorageClassFilter;
74+
7275
public StoreContextBuilder setFsURI(final URI fsURI) {
7376
this.fsURI = fsURI;
7477
return this;
@@ -175,6 +178,12 @@ public StoreContextBuilder setEnableCSE(
175178
return this;
176179
}
177180

181+
/**
 * Set the filter which decides how objects in Glacier storage classes
 * are treated during listings; derived from the
 * {@code fs.s3a.glacier.read.restored.objects} configuration.
 * @param value filter policy
 * @return this builder
 */
public StoreContextBuilder setS3ObjectStorageClassFilter(
    S3ObjectStorageClassFilter value) {
  s3ObjectStorageClassFilter = value;
  return this;
}
186+
178187
public StoreContext build() {
179188
return new StoreContext(fsURI,
180189
bucket,
@@ -192,6 +201,7 @@ public StoreContext build() {
192201
useListV1,
193202
contextAccessors,
194203
auditor,
195-
isCSEEnabled);
204+
isCSEEnabled,
205+
s3ObjectStorageClassFilter);
196206
}
197207
}

hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1285,6 +1285,20 @@ The switch to turn S3A auditing on or off.
12851285
</description>
12861286
</property>
12871287

1288+
<!--
1289+
The switch to control how S3A handles glacier storage classes.
1290+
-->
1291+
<property>
1292+
<name>fs.s3a.glacier.read.restored.objects</name>
1293+
<value>READ_ALL</value>
1294+
<description>
1295+
The config can have 3 values:

* READ_ALL: Attempt to read all objects, regardless of storage class. Reading a Glacier or Deep Archive object which has not been restored fails with InvalidObjectStateException: The operation is not valid for the object's storage class.
* SKIP_ALL_GLACIER: Ignore any S3 objects stored in a Glacier storage class; only the remaining objects are listed and read.
* READ_RESTORED_GLACIER_OBJECTS: Check the restored status of Glacier objects; fully restored objects are read like normal S3 objects, while unrestored ones are ignored since their data has not yet been retrieved from Glacier.
1300+
</description>
1301+
</property>
12881302
```
12891303
## <a name="retry_and_recovery"></a>Retry and Recovery
12901304

0 commit comments

Comments
 (0)