Skip to content

Commit df41975

Browse files
authored
HADOOP-18029: Update CompressionCodecFactory to handle uppercase file extensions (#3739)
Co-authored-by: Desmond Sisson <sissonde@amazon.com>
1 parent 4dea4a7 commit df41975

File tree

2 files changed

+110
-79
lines changed

2 files changed

+110
-79
lines changed

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionCodecFactory.java

Lines changed: 74 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ public class CompressionCodecFactory {
4040
LoggerFactory.getLogger(CompressionCodecFactory.class.getName());
4141

4242
private static final ServiceLoader<CompressionCodec> CODEC_PROVIDERS =
43-
ServiceLoader.load(CompressionCodec.class);
43+
ServiceLoader.load(CompressionCodec.class);
4444

4545
/**
4646
* A map from the reversed filename suffixes to the codecs.
@@ -49,15 +49,15 @@ public class CompressionCodecFactory {
4949
*/
5050
private SortedMap<String, CompressionCodec> codecs = null;
5151

52-
/**
53-
* A map from the reversed filename suffixes to the codecs.
54-
* This is probably overkill, because the maps should be small, but it
55-
* automatically supports finding the longest matching suffix.
56-
*/
57-
private Map<String, CompressionCodec> codecsByName = null;
52+
/**
53+
* A map from the reversed filename suffixes to the codecs.
54+
* This is probably overkill, because the maps should be small, but it
55+
* automatically supports finding the longest matching suffix.
56+
*/
57+
private Map<String, CompressionCodec> codecsByName = null;
5858

5959
/**
60-
* A map from class names to the codecs
60+
* A map from class names to the codecs.
6161
*/
6262
private HashMap<String, CompressionCodec> codecsByClassName = null;
6363

@@ -80,8 +80,8 @@ private void addCodec(CompressionCodec codec) {
8080
@Override
8181
public String toString() {
8282
StringBuilder buf = new StringBuilder();
83-
Iterator<Map.Entry<String, CompressionCodec>> itr =
84-
codecs.entrySet().iterator();
83+
Iterator<Map.Entry<String, CompressionCodec>> itr =
84+
codecs.entrySet().iterator();
8585
buf.append("{ ");
8686
if (itr.hasNext()) {
8787
Map.Entry<String, CompressionCodec> entry = itr.next();
@@ -110,8 +110,8 @@ public String toString() {
110110
*/
111111
public static List<Class<? extends CompressionCodec>> getCodecClasses(
112112
Configuration conf) {
113-
List<Class<? extends CompressionCodec>> result
114-
= new ArrayList<Class<? extends CompressionCodec>>();
113+
List<Class<? extends CompressionCodec>> result =
114+
new ArrayList<Class<? extends CompressionCodec>>();
115115
// Add codec classes discovered via service loading
116116
synchronized (CODEC_PROVIDERS) {
117117
// CODEC_PROVIDERS is a lazy collection. Synchronize so it is
@@ -200,11 +200,13 @@ public CompressionCodec getCodec(Path file) {
200200
String filename = file.getName();
201201
String reversedFilename =
202202
new StringBuilder(filename).reverse().toString();
203-
SortedMap<String, CompressionCodec> subMap =
204-
codecs.headMap(reversedFilename);
203+
String lowerReversedFilename =
204+
StringUtils.toLowerCase(reversedFilename);
205+
SortedMap<String, CompressionCodec> subMap =
206+
codecs.headMap(lowerReversedFilename);
205207
if (!subMap.isEmpty()) {
206208
String potentialSuffix = subMap.lastKey();
207-
if (reversedFilename.startsWith(potentialSuffix)) {
209+
if (lowerReversedFilename.startsWith(potentialSuffix)) {
208210
result = codecs.get(potentialSuffix);
209211
}
210212
}
@@ -224,57 +226,57 @@ public CompressionCodec getCodecByClassName(String classname) {
224226
return codecsByClassName.get(classname);
225227
}
226228

227-
/**
228-
* Find the relevant compression codec for the codec's canonical class name
229-
* or by codec alias.
230-
* <p>
231-
* Codec aliases are case insensitive.
232-
* <p>
233-
* The code alias is the short class name (without the package name).
234-
* If the short class name ends with 'Codec', then there are two aliases for
235-
* the codec, the complete short class name and the short class name without
236-
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the
237-
* alias are 'gzip' and 'gzipcodec'.
238-
*
239-
* @param codecName the canonical class name of the codec
240-
* @return the codec object
241-
*/
242-
public CompressionCodec getCodecByName(String codecName) {
243-
if (codecsByClassName == null) {
244-
return null;
245-
}
246-
CompressionCodec codec = getCodecByClassName(codecName);
247-
if (codec == null) {
248-
// trying to get the codec by name in case the name was specified
249-
// instead a class
250-
codec = codecsByName.get(StringUtils.toLowerCase(codecName));
251-
}
252-
return codec;
229+
/**
230+
* Find the relevant compression codec for the codec's canonical class name
231+
* or by codec alias.
232+
* <p>
233+
* Codec aliases are case insensitive.
234+
* <p>
235+
* The code alias is the short class name (without the package name).
236+
* If the short class name ends with 'Codec', then there are two aliases for
237+
* the codec, the complete short class name and the short class name without
238+
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the
239+
* alias are 'gzip' and 'gzipcodec'.
240+
*
241+
* @param codecName the canonical class name of the codec
242+
* @return the codec object
243+
*/
244+
public CompressionCodec getCodecByName(String codecName) {
245+
if (codecsByClassName == null) {
246+
return null;
253247
}
248+
CompressionCodec codec = getCodecByClassName(codecName);
249+
if (codec == null) {
250+
// trying to get the codec by name in case the name was specified
251+
// instead a class
252+
codec = codecsByName.get(StringUtils.toLowerCase(codecName));
253+
}
254+
return codec;
255+
}
254256

255-
/**
256-
* Find the relevant compression codec for the codec's canonical class name
257-
* or by codec alias and returns its implemetation class.
258-
* <p>
259-
* Codec aliases are case insensitive.
260-
* <p>
261-
* The code alias is the short class name (without the package name).
262-
* If the short class name ends with 'Codec', then there are two aliases for
263-
* the codec, the complete short class name and the short class name without
264-
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the
265-
* alias are 'gzip' and 'gzipcodec'.
266-
*
267-
* @param codecName the canonical class name of the codec
268-
* @return the codec class
269-
*/
270-
public Class<? extends CompressionCodec> getCodecClassByName(
271-
String codecName) {
272-
CompressionCodec codec = getCodecByName(codecName);
273-
if (codec == null) {
274-
return null;
275-
}
276-
return codec.getClass();
257+
/**
258+
* Find the relevant compression codec for the codec's canonical class name
259+
* or by codec alias and returns its implemetation class.
260+
* <p>
261+
* Codec aliases are case insensitive.
262+
* <p>
263+
* The code alias is the short class name (without the package name).
264+
* If the short class name ends with 'Codec', then there are two aliases for
265+
* the codec, the complete short class name and the short class name without
266+
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the
267+
* alias are 'gzip' and 'gzipcodec'.
268+
*
269+
* @param codecName the canonical class name of the codec
270+
* @return the codec class
271+
*/
272+
public Class<? extends CompressionCodec> getCodecClassByName(
273+
String codecName) {
274+
CompressionCodec codec = getCodecByName(codecName);
275+
if (codec == null) {
276+
return null;
277277
}
278+
return codec.getClass();
279+
}
278280

279281
/**
280282
* Removes a suffix from a filename, if it has it.
@@ -323,8 +325,12 @@ public static void main(String[] args) throws Exception {
323325
len = in.read(buffer);
324326
}
325327
} finally {
326-
if(out != null) { out.close(); }
327-
if(in != null) { in.close(); }
328+
if(out != null) {
329+
out.close();
330+
}
331+
if(in != null) {
332+
in.close();
333+
}
328334
}
329335
} else {
330336
CompressionInputStream in = null;
@@ -338,7 +344,9 @@ public static void main(String[] args) throws Exception {
338344
len = in.read(buffer);
339345
}
340346
} finally {
341-
if(in != null) { in.close(); }
347+
if(in != null) {
348+
in.close();
349+
}
342350
}
343351
}
344352
}

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/compress/TestCodecFactory.java

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
import org.junit.Test;
3131
import static org.junit.Assert.assertEquals;
32+
import static org.junit.Assert.assertNull;
3233
import static org.junit.Assert.fail;
3334

3435
public class TestCodecFactory {
@@ -45,8 +46,8 @@ public Configuration getConf() {
4546
}
4647

4748
@Override
48-
public CompressionOutputStream createOutputStream(OutputStream out)
49-
throws IOException {
49+
public CompressionOutputStream createOutputStream(OutputStream out)
50+
throws IOException {
5051
return null;
5152
}
5253

@@ -62,21 +63,21 @@ public Compressor createCompressor() {
6263

6364
@Override
6465
public CompressionInputStream createInputStream(InputStream in,
65-
Decompressor decompressor)
66-
throws IOException {
66+
Decompressor decompressor)
67+
throws IOException {
6768
return null;
6869
}
6970

7071
@Override
71-
public CompressionInputStream createInputStream(InputStream in)
72-
throws IOException {
72+
public CompressionInputStream createInputStream(InputStream in)
73+
throws IOException {
7374
return null;
7475
}
7576

7677
@Override
7778
public CompressionOutputStream createOutputStream(OutputStream out,
78-
Compressor compressor)
79-
throws IOException {
79+
Compressor compressor)
80+
throws IOException {
8081
return null;
8182
}
8283

@@ -125,7 +126,7 @@ public String getDefaultExtension() {
125126
}
126127

127128
/**
128-
* Returns a factory for a given set of codecs
129+
* Returns a factory for a given set of codecs.
129130
* @param classes the codec classes to include
130131
* @return a new factory
131132
*/
@@ -137,22 +138,30 @@ private static CompressionCodecFactory setClasses(Class[] classes) {
137138

138139
private static void checkCodec(String msg,
139140
Class expected, CompressionCodec actual) {
140-
assertEquals(msg + " unexpected codec found",
141-
expected.getName(),
142-
actual.getClass().getName());
141+
if (expected == null) {
142+
assertNull(msg, actual);
143+
} else if (actual == null) {
144+
fail(msg + " result was null");
145+
} else {
146+
assertEquals(msg + " unexpected codec found",
147+
expected.getName(),
148+
actual.getClass().getName());
149+
}
143150
}
144151

145152
@Test
146153
public void testFinding() {
147154
CompressionCodecFactory factory =
148-
new CompressionCodecFactory(new Configuration());
155+
new CompressionCodecFactory(new Configuration());
149156
CompressionCodec codec = factory.getCodec(new Path("/tmp/foo.bar"));
150157
assertEquals("default factory foo codec", null, codec);
151158
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
152159
assertEquals("default factory foo codec", null, codec);
153160

154161
codec = factory.getCodec(new Path("/tmp/foo.gz"));
155162
checkCodec("default factory for .gz", GzipCodec.class, codec);
163+
codec = factory.getCodec(new Path("/tmp/foo.GZ"));
164+
checkCodec("default factory for .GZ", GzipCodec.class, codec);
156165
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
157166
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
158167
codec = factory.getCodecByName("gzip");
@@ -168,6 +177,8 @@ public void testFinding() {
168177

169178
codec = factory.getCodec(new Path("/tmp/foo.bz2"));
170179
checkCodec("default factory for .bz2", BZip2Codec.class, codec);
180+
codec = factory.getCodec(new Path("/tmp/foo.BZ2"));
181+
checkCodec("default factory for .BZ2", BZip2Codec.class, codec);
171182
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
172183
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
173184
codec = factory.getCodecByName("bzip2");
@@ -221,16 +232,22 @@ public void testFinding() {
221232
FooBarCodec.class});
222233
codec = factory.getCodec(new Path("/tmp/.foo.bar.gz"));
223234
checkCodec("full factory gz codec", GzipCodec.class, codec);
235+
codec = factory.getCodec(new Path("/tmp/.foo.bar.GZ"));
236+
checkCodec("full factory GZ codec", GzipCodec.class, codec);
224237
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
225238
checkCodec("full codec gz codec", GzipCodec.class, codec);
226239

227240
codec = factory.getCodec(new Path("/tmp/foo.bz2"));
228241
checkCodec("full factory for .bz2", BZip2Codec.class, codec);
242+
codec = factory.getCodec(new Path("/tmp/foo.BZ2"));
243+
checkCodec("full factory for .BZ2", BZip2Codec.class, codec);
229244
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
230245
checkCodec("full codec bzip2 codec", BZip2Codec.class, codec);
231246

232247
codec = factory.getCodec(new Path("/tmp/foo.bar"));
233248
checkCodec("full factory bar codec", BarCodec.class, codec);
249+
codec = factory.getCodec(new Path("/tmp/foo.BAR"));
250+
checkCodec("full factory BAR codec", BarCodec.class, codec);
234251
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
235252
checkCodec("full factory bar codec", BarCodec.class, codec);
236253
codec = factory.getCodecByName("bar");
@@ -240,6 +257,8 @@ public void testFinding() {
240257

241258
codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar"));
242259
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
260+
codec = factory.getCodec(new Path("/tmp/foo/baz.FOO.bar"));
261+
checkCodec("full factory FOO bar codec", FooBarCodec.class, codec);
243262
codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName());
244263
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
245264
codec = factory.getCodecByName("foobar");
@@ -249,6 +268,8 @@ public void testFinding() {
249268

250269
codec = factory.getCodec(new Path("/tmp/foo.foo"));
251270
checkCodec("full factory foo codec", FooCodec.class, codec);
271+
codec = factory.getCodec(new Path("/tmp/FOO.FOO"));
272+
checkCodec("full factory FOO codec", FooCodec.class, codec);
252273
codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName());
253274
checkCodec("full factory foo codec", FooCodec.class, codec);
254275
codec = factory.getCodecByName("foo");
@@ -259,6 +280,8 @@ public void testFinding() {
259280
factory = setClasses(new Class[]{NewGzipCodec.class});
260281
codec = factory.getCodec(new Path("/tmp/foo.gz"));
261282
checkCodec("overridden factory for .gz", NewGzipCodec.class, codec);
283+
codec = factory.getCodec(new Path("/tmp/foo.GZ"));
284+
checkCodec("overridden factory for .GZ", NewGzipCodec.class, codec);
262285
codec = factory.getCodecByClassName(NewGzipCodec.class.getCanonicalName());
263286
checkCodec("overridden factory for gzip codec", NewGzipCodec.class, codec);
264287

0 commit comments

Comments
 (0)