@@ -31,6 +31,7 @@
31 | 31 | import org.apache.iceberg.io.CloseableIterable; |
32 | 32 | import org.apache.iceberg.orc.ORC; |
33 | 33 | import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; |
| 34 | +import org.apache.iceberg.relocated.com.google.common.collect.Iterators; |
34 | 35 | import org.apache.iceberg.relocated.com.google.common.collect.Lists; |
35 | 36 | import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; |
36 | 37 | import org.apache.iceberg.types.Types; |
@@ -39,6 +40,7 @@
39 | 40 | import org.apache.orc.Writer; |
40 | 41 | import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; |
41 | 42 | import org.apache.orc.storage.ql.exec.vector.LongColumnVector; |
| 43 | +import org.apache.orc.storage.ql.exec.vector.StructColumnVector; |
42 | 44 | import org.apache.orc.storage.ql.exec.vector.UnionColumnVector; |
43 | 45 | import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; |
44 | 46 | import org.apache.spark.sql.catalyst.InternalRow; |
@@ -215,4 +217,100 @@ public void testSingleComponentUnion() throws IOException { |
215 | 217 | assertEquals(expectedSchema, expectedSecondRow, rowIterator.next()); |
216 | 218 | } |
217 | 219 | } |
| 220 | + |
| 221 | + @Test |
| 222 | + public void testDeeplyNestedUnion() throws IOException { |
| 223 | + TypeDescription orcSchema = |
| 224 | + TypeDescription.fromString("struct<c1:uniontype<int,struct<c2:string,c3:uniontype<int,string>>>>"); |
| 225 | + |
| 226 | + Schema expectedSchema = new Schema( |
| 227 | + Types.NestedField.optional(0, "c1", Types.StructType.of( |
| 228 | + Types.NestedField.optional(1, "tag_0", Types.IntegerType.get()), |
| 229 | + Types.NestedField.optional(2, "tag_1", |
| 230 | + Types.StructType.of(Types.NestedField.optional(3, "c2", Types.StringType.get()), |
| 231 | + Types.NestedField.optional(4, "c3", Types.StructType.of( |
| 232 | + Types.NestedField.optional(5, "tag_0", Types.IntegerType.get()), |
| 233 | + Types.NestedField.optional(6, "tag_1", Types.StringType.get())))))))); |
| 234 | + |
| 235 | + final InternalRow expectedFirstRow = new GenericInternalRow(1); |
| 236 | + final InternalRow inner1 = new GenericInternalRow(2); |
| 237 | + inner1.update(0, null); |
| 238 | + final InternalRow inner2 = new GenericInternalRow(2); |
| 239 | + inner2.update(0, UTF8String.fromString("foo0")); |
| 240 | + final InternalRow inner3 = new GenericInternalRow(2); |
| 241 | + inner3.update(0, 0); |
| 242 | + inner3.update(1, null); |
| 243 | + inner2.update(1, inner3); |
| 244 | + inner1.update(1, inner2); |
| 245 | + expectedFirstRow.update(0, inner1); |
| 246 | + |
| 247 | + Configuration conf = new Configuration(); |
| 248 | + |
| 249 | + File orcFile = temp.newFile(); |
| 250 | + Path orcFilePath = new Path(orcFile.getPath()); |
| 251 | + |
| 252 | + Writer writer = OrcFile.createWriter(orcFilePath, |
| 253 | + OrcFile.writerOptions(conf) |
| 254 | + .setSchema(orcSchema).overwrite(true)); |
| 255 | + |
| 256 | + VectorizedRowBatch batch = orcSchema.createRowBatch(); |
| 257 | + UnionColumnVector innerUnion1 = (UnionColumnVector) batch.cols[0]; |
| 258 | + LongColumnVector innerInt1 = (LongColumnVector) innerUnion1.fields[0]; |
| 259 | + innerInt1.fillWithNulls(); |
| 260 | + StructColumnVector innerStruct2 = (StructColumnVector) innerUnion1.fields[1]; |
| 261 | + BytesColumnVector innerString2 = (BytesColumnVector) innerStruct2.fields[0]; |
| 262 | + UnionColumnVector innerUnion3 = (UnionColumnVector) innerStruct2.fields[1]; |
| 263 | + LongColumnVector innerInt3 = (LongColumnVector) innerUnion3.fields[0]; |
| 264 | + BytesColumnVector innerString3 = (BytesColumnVector) innerUnion3.fields[1]; |
| 265 | + innerString3.fillWithNulls(); |
| 266 | + |
| 267 | + for (int r = 0; r < NUM_OF_ROWS; ++r) { |
| 268 | + int row = batch.size++; |
| 269 | + innerUnion1.tags[row] = 1; |
| 270 | + innerString2.setVal(row, ("foo" + row).getBytes(StandardCharsets.UTF_8)); |
| 271 | + innerUnion3.tags[row] = 0; |
| 272 | + innerInt3.vector[row] = r; |
| 273 | + // If the batch is full, write it out and start over. |
| 274 | + if (batch.size == batch.getMaxSize()) { |
| 275 | + writer.addRowBatch(batch); |
| 276 | + batch.reset(); |
| 277 | + innerInt1.fillWithNulls(); |
| 278 | + innerString3.fillWithNulls(); |
| 279 | + } |
| 280 | + } |
| 281 | + if (batch.size != 0) { |
| 282 | + writer.addRowBatch(batch); |
| 283 | + batch.reset(); |
| 284 | + } |
| 285 | + writer.close(); |
| 286 | + |
| 287 | + // test non-vectorized reader |
| 288 | + List<InternalRow> results = Lists.newArrayList(); |
| 289 | + try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(orcFile)) |
| 290 | + .project(expectedSchema) |
| 291 | + .createReaderFunc(readOrcSchema -> new SparkOrcReader(expectedSchema, readOrcSchema)) |
| 292 | + .build()) { |
| 293 | + reader.forEach(results::add); |
| 294 | + final InternalRow actualFirstRow = results.get(0); |
| 295 | + |
| 296 | + Assert.assertEquals(NUM_OF_ROWS, results.size()); |
| 297 | + assertEquals(expectedSchema, expectedFirstRow, actualFirstRow); |
| 298 | + } |
| 299 | + |
| 300 | + // test vectorized reader |
| 301 | + try (CloseableIterable<ColumnarBatch> reader = ORC.read(Files.localInput(orcFile)) |
| 302 | + .project(expectedSchema) |
| 303 | + .createBatchedReaderFunc(readOrcSchema -> |
| 304 | + VectorizedSparkOrcReaders.buildReader(expectedSchema, readOrcSchema, ImmutableMap.of())) |
| 305 | + .build()) { |
| 306 | + final Iterator<InternalRow> actualRows = batchesToRows(reader.iterator()); |
| 307 | + final InternalRow actualFirstRow = actualRows.next(); |
| 308 | + |
| 309 | + assertEquals(expectedSchema, expectedFirstRow, actualFirstRow); |
| 310 | + } |
| 311 | + } |
| 312 | + |
| 313 | + private Iterator<InternalRow> batchesToRows(Iterator<ColumnarBatch> batches) { |
| 314 | + return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator)); |
| 315 | + } |
218 | 316 | } |