
[FLINK-27805][Connectors/ORC] bump orc version to 1.7.5 #19844

Closed · wants to merge 4 commits (changes shown from 1 commit)
12 changes: 12 additions & 0 deletions flink-formats/flink-orc-nohive/pom.xml
@@ -82,6 +82,18 @@ under the License.
</exclusions>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<scope>provided</scope>
</dependency>

<!-- Tests -->

<dependency>
@@ -20,7 +20,8 @@

import org.apache.flink.api.common.serialization.BulkWriter;
import org.apache.flink.core.fs.FSDataOutputStream;
import org.apache.flink.orc.nohive.writer.NoHivePhysicalWriterImpl;
import org.apache.flink.orc.writer.EncryptionProvider;
import org.apache.flink.orc.writer.HadoopNoCloseStream;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.DecimalType;
import org.apache.flink.table.types.logical.LocalZonedTimestampType;
@@ -31,6 +32,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.PhysicalFsWriter;
import org.apache.orc.impl.WriterImpl;
import org.apache.orc.storage.common.type.HiveDecimal;
import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
@@ -65,7 +67,11 @@ public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
OrcFile.WriterOptions opts = OrcFile.writerOptions(new Properties(), conf);
TypeDescription description = TypeDescription.fromString(schema);
opts.setSchema(description);
opts.physicalWriter(new NoHivePhysicalWriterImpl(out, opts));

HadoopNoCloseStream hadoopOutputStream = new HadoopNoCloseStream(out, null);
EncryptionProvider provider = new EncryptionProvider(opts);
opts.physicalWriter(
new PhysicalFsWriter(hadoopOutputStream, opts, provider.getEncryptionVariants()));
WriterImpl writer = new WriterImpl(null, new Path("."), opts);

VectorizedRowBatch rowBatch = description.createRowBatch();

This file was deleted.

@@ -47,11 +47,11 @@ protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException
TypeDescription schema =
TypeDescription.fromString(
"struct<"
+ "f0:float,"
+ "f1:double,"
+ "f2:timestamp,"
+ "f3:tinyint,"
+ "f4:smallint"
+ "_col0:float,"
+ "_col1:double,"
+ "_col2:timestamp,"
+ "_col3:tinyint,"
+ "_col4:smallint"
+ ">");

org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
@@ -105,7 +105,9 @@ protected OrcColumnarRowSplitReader createReader(
throws IOException {
return OrcNoHiveSplitReaderUtil.genPartColumnarRowReader(
new Configuration(),
IntStream.range(0, fullTypes.length).mapToObj(i -> "f" + i).toArray(String[]::new),
IntStream.range(0, fullTypes.length)
.mapToObj(i -> "_col" + i)
.toArray(String[]::new),
fullTypes,
partitionSpec,
selectedFields,
@@ -0,0 +1,134 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flink.orc.writer;

import org.apache.hadoop.conf.Configuration;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.CryptoUtils;
import org.apache.orc.impl.HadoopShims;
import org.apache.orc.impl.KeyProvider;
import org.apache.orc.impl.writer.WriterEncryptionKey;
import org.apache.orc.impl.writer.WriterEncryptionVariant;

import java.io.IOException;
import java.security.SecureRandom;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

/**
 * Copies the encryption variant generation code from org.apache.orc:orc-core:1.7.2 {@link
 * org.apache.orc.impl.WriterImpl}. It is used to produce the same encryption variants that {@link
 * org.apache.orc.impl.WriterImpl} generates.
*
* <p>NOTE: This class will be removed after ORC-1200 is merged.
*/
public class EncryptionProvider {

private final SortedMap<String, WriterEncryptionKey> keys = new TreeMap<>();

private WriterEncryptionVariant[] encryptionVariants;

public EncryptionProvider(OrcFile.WriterOptions opts) throws IOException {
TypeDescription schema = opts.getSchema();
schema.annotateEncryption(opts.getEncryption(), opts.getMasks());
this.encryptionVariants =
setupEncryption(opts.getKeyProvider(), schema, opts.getKeyOverrides());
}

/**
* Iterate through the encryption options given by the user and set up our data structures.
*
* @param provider the KeyProvider to use to generate keys
* @param schema the type tree that we search for annotations
* @param keyOverrides user specified key overrides
*/
private WriterEncryptionVariant[] setupEncryption(
KeyProvider provider,
TypeDescription schema,
Map<String, HadoopShims.KeyMetadata> keyOverrides)
throws IOException {
KeyProvider keyProvider =
provider != null
? provider
: CryptoUtils.getKeyProvider(new Configuration(), new SecureRandom());
// Load the overrides into the cache so that we use the required key versions.
for (HadoopShims.KeyMetadata key : keyOverrides.values()) {
keys.put(key.getKeyName(), new WriterEncryptionKey(key));
}
int variantCount = visitTypeTree(schema, false, keyProvider);

// Now that we have de-duped the keys and maskDescriptions, make the arrays
int nextId = 0;
int nextVariantId = 0;
WriterEncryptionVariant[] result = new WriterEncryptionVariant[variantCount];
for (WriterEncryptionKey key : keys.values()) {
key.setId(nextId++);
key.sortRoots();
for (WriterEncryptionVariant variant : key.getEncryptionRoots()) {
result[nextVariantId] = variant;
variant.setId(nextVariantId++);
}
}
return result;
}

private int visitTypeTree(TypeDescription schema, boolean encrypted, KeyProvider provider)
throws IOException {
int result = 0;
String keyName = schema.getAttributeValue(TypeDescription.ENCRYPT_ATTRIBUTE);
if (keyName != null) {
if (provider == null) {
throw new IllegalArgumentException("Encryption requires a KeyProvider.");
}
if (encrypted) {
throw new IllegalArgumentException("Nested encryption type: " + schema);
}
encrypted = true;
result += 1;
WriterEncryptionKey key = getKey(keyName, provider);
HadoopShims.KeyMetadata metadata = key.getMetadata();
WriterEncryptionVariant variant =
new WriterEncryptionVariant(key, schema, provider.createLocalKey(metadata));
key.addRoot(variant);
}
List<TypeDescription> children = schema.getChildren();
if (children != null) {
for (TypeDescription child : children) {
result += visitTypeTree(child, encrypted, provider);
}
}
return result;
}

private WriterEncryptionKey getKey(String keyName, KeyProvider provider) throws IOException {
WriterEncryptionKey result = keys.get(keyName);
if (result == null) {
result = new WriterEncryptionKey(provider.getCurrentKeyVersion(keyName));
keys.put(keyName, result);
}
return result;
}

public WriterEncryptionVariant[] getEncryptionVariants() {
return encryptionVariants;
}
}
@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flink.orc.writer;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;
import java.io.OutputStream;

/**
 * This class is designed not to close the underlying Flink stream, in order to avoid exceptions
 * when Flink performs a checkpoint.
*/
public class HadoopNoCloseStream extends FSDataOutputStream {
This looks a bit scary, @gyfora?

@liujiawinds could you please clarify why we need the HadoopNoCloseStream here?

Contributor Author

@morhidi Because BulkWriter relies on a Flink FSDataOutputStream while the ORC writer expects a Hadoop FSDataOutputStream, I wrapped the Flink FSDataOutputStream.
Additionally, BulkWriter closes the underlying ORC writer stream at checkpoint time, which would cause Flink to throw a ClosedChannelException if the close call were forwarded to the Flink FSDataOutputStream.
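
Sketched in isolation (illustrative only; HadoopNoCloseStream is the wrapper added in this PR, the helper class around it is made up):

final class StreamAdapterSketch {
    // Adapts Flink's stream type to Hadoop's so that ORC's PhysicalFsWriter
    // can write to it. Flink's FSDataOutputStream extends java.io.OutputStream,
    // so it can be wrapped directly; close() on the wrapper is a no-op, leaving
    // Flink in charge of the stream's lifecycle across checkpoints.
    static org.apache.hadoop.fs.FSDataOutputStream adapt(
            org.apache.flink.core.fs.FSDataOutputStream flinkOut) throws java.io.IOException {
        return new org.apache.flink.orc.writer.HadoopNoCloseStream(flinkOut, null);
    }
}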

qq: Is this covered in a test case somewhere?


public HadoopNoCloseStream(OutputStream out, FileSystem.Statistics stats) throws IOException {
super(out, stats);
}

@Override
public void close() throws IOException {
// No-op: don't close the internal stream here, to avoid "Stream Closed" or
// ClosedChannelException errors when Flink performs a checkpoint.
}
}
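
Regarding the test-coverage question above: such a test is not part of this change, but a minimal sketch (hypothetical, assuming JUnit 5) could look like this:

package org.apache.flink.orc.writer;

import org.junit.jupiter.api.Test;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;

import static org.junit.jupiter.api.Assertions.assertFalse;

class HadoopNoCloseStreamTest {

    @Test
    void closeIsANoOpOnTheWrappedStream() throws IOException {
        // Track whether close() ever reaches the inner stream.
        AtomicBoolean innerClosed = new AtomicBoolean(false);
        ByteArrayOutputStream inner =
                new ByteArrayOutputStream() {
                    @Override
                    public void close() throws IOException {
                        innerClosed.set(true);
                        super.close();
                    }
                };

        HadoopNoCloseStream stream = new HadoopNoCloseStream(inner, null);
        stream.write(42);
        stream.close();

        // If close() were forwarded, Flink would later hit a
        // ClosedChannelException on the real stream during checkpointing.
        assertFalse(innerClosed.get());
    }
}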
@@ -26,7 +26,9 @@

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.impl.PhysicalFsWriter;
import org.apache.orc.impl.WriterImpl;

import java.io.IOException;
@@ -73,6 +75,13 @@ public OrcBulkWriterFactory(Vectorizer<T> vectorizer, Configuration configuration)
this(vectorizer, null, configuration);
}

public OrcBulkWriterFactory(Vectorizer<T> vectorizer, OrcFile.WriterOptions writerOptions) {
this.vectorizer = vectorizer;
this.writerOptions = writerOptions;
this.writerProperties = null;
this.confMap = new HashMap<>();
}

/**
* Creates a new OrcBulkWriterFactory using the provided Vectorizer, Hadoop Configuration, ORC
* writer properties.
@@ -96,7 +105,10 @@ public OrcBulkWriterFactory(
@Override
public BulkWriter<T> create(FSDataOutputStream out) throws IOException {
OrcFile.WriterOptions opts = getWriterOptions();
opts.physicalWriter(new PhysicalWriterImpl(out, opts));
HadoopNoCloseStream hadoopOutputStream = new HadoopNoCloseStream(out, null);
EncryptionProvider provider = new EncryptionProvider(opts);
opts.physicalWriter(
new PhysicalFsWriter(hadoopOutputStream, opts, provider.getEncryptionVariants()));

// The path of the Writer is not used to indicate the destination file
// in this case since we have used a dedicated physical writer to write
@@ -115,9 +127,12 @@ protected OrcFile.WriterOptions getWriterOptions() {
}

writerOptions = OrcFile.writerOptions(writerProperties, conf);
writerOptions.setSchema(this.vectorizer.getSchema());
}

// Column encryption configuration
writerOptions.encrypt(OrcConf.ENCRYPTION.getString(writerProperties, conf));
writerOptions.masks(OrcConf.DATA_MASK.getString(writerProperties, conf));
}
writerOptions.setSchema(this.vectorizer.getSchema());
return writerOptions;
}
}
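
For context, a minimal sketch of how the new OrcConf wiring could be exercised end to end (the key name, column names, and mask are hypothetical, and the Vectorizer is assumed to exist; property keys come from OrcConf):

import org.apache.flink.orc.vector.Vectorizer;
import org.apache.flink.orc.writer.OrcBulkWriterFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.orc.OrcConf;

import java.util.Properties;

class EncryptedOrcFactoryExample {
    static <T> OrcBulkWriterFactory<T> create(Vectorizer<T> vectorizer) {
        Properties props = new Properties();
        // Format "<keyName>:<columns>" -- key name and column names are made up here.
        props.setProperty(OrcConf.ENCRYPTION.getAttribute(), "pii:ssn,email");
        props.setProperty(OrcConf.DATA_MASK.getAttribute(), "nullify:ssn,email");
        // The patched getWriterOptions() forwards these properties to
        // WriterOptions#encrypt and WriterOptions#masks.
        return new OrcBulkWriterFactory<>(vectorizer, props, new Configuration());
    }
}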