Skip to content

Commit eea4a54

Browse files
LouisCltpitrou
authored and committed
ARROW-17524: [C++] Correction for fields included when reading an ORC table (#13962)
I think there is a bug in the ORC reader: when we specify the indices of the fields that we want to keep, the selection does not work correctly. Looking at the code, this seems to be because we call "includeTypes" in lieu of "include" when setting the ORC options; "includeTypes" selects columns by ORC type id, whereas "include" selects top-level fields by index. This can be problematic when we want to import an ORC table containing Union types, as the import will raise an error even if we try not to import those specific fields. The definitions of the corresponding ORC methods are here: https://github.com/apache/orc/blob/72220851cbde164a22706f8d47741fd1ad3db190/c%2B%2B/src/Options.hh#L185-L191 and https://github.com/apache/orc/blob/72220851cbde164a22706f8d47741fd1ad3db190/c%2B%2B/src/Options.hh#L201-L207 Lead-authored-by: LouisClt <louis1110@hotmail.fr> Co-authored-by: Antoine Pitrou <antoine@python.org> Co-authored-by: Antoine Pitrou <pitrou@free.fr> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent ff80b30 commit eea4a54

File tree

4 files changed

+45
-11
lines changed

4 files changed

+45
-11
lines changed

c_glib/test/test-orc-file-reader.rb

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -185,8 +185,8 @@ def all_columns
185185
test("select fields") do
186186
require_gi_bindings(3, 2, 6)
187187
@reader.field_indices = [1, 3]
188-
assert_equal(build_table("boolean1" => build_boolean_array([false, true]),
189-
"short1" => build_int16_array([1024, 2048])),
188+
assert_equal(build_table("byte1" => build_int8_array([1, 100]),
189+
"int1" => build_int32_array([65536, 65536])),
190190
@reader.read_stripes)
191191
end
192192
end
@@ -200,10 +200,8 @@ def all_columns
200200
test("select fields") do
201201
require_gi_bindings(3, 2, 6)
202202
@reader.field_indices = [1, 3]
203-
boolean1 = build_boolean_array([false, true])
204-
short1 = build_int16_array([1024, 2048])
205-
assert_equal(build_record_batch("boolean1" => boolean1,
206-
"short1" => short1),
203+
assert_equal(build_record_batch("byte1" => build_int8_array([1, 100]),
204+
"int1" => build_int32_array([65536, 65536])),
207205
@reader.read_stripe(0))
208206
end
209207
end

cpp/src/arrow/adapters/orc/adapter.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ class ORCFileReader::Impl {
411411
ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index"));
412412
include_indices_list.push_back(*it);
413413
}
414-
opts->includeTypes(include_indices_list);
414+
opts->include(include_indices_list);
415415
return Status::OK();
416416
}
417417

cpp/src/arrow/adapters/orc/adapter_test.cc

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,8 @@ std::shared_ptr<Table> GenerateRandomTable(const std::shared_ptr<Schema>& schema
226226

227227
void AssertTableWriteReadEqual(const std::shared_ptr<Table>& input_table,
228228
const std::shared_ptr<Table>& expected_output_table,
229-
const int64_t max_size = kDefaultSmallMemStreamSize) {
229+
const int64_t max_size = kDefaultSmallMemStreamSize,
230+
std::vector<int>* opt_selected_read_indices = nullptr) {
230231
EXPECT_OK_AND_ASSIGN(auto buffer_output_stream,
231232
io::BufferOutputStream::Create(max_size));
232233
auto write_options = adapters::orc::WriteOptions();
@@ -250,7 +251,11 @@ void AssertTableWriteReadEqual(const std::shared_ptr<Table>& input_table,
250251
ASSERT_EQ(reader->GetCompression(), write_options.compression);
251252
ASSERT_EQ(reader->GetCompressionSize(), write_options.compression_block_size);
252253
ASSERT_EQ(reader->GetRowIndexStride(), write_options.row_index_stride);
253-
EXPECT_OK_AND_ASSIGN(auto actual_output_table, reader->Read());
254+
EXPECT_OK_AND_ASSIGN(auto actual_output_table,
255+
opt_selected_read_indices == nullptr
256+
? reader->Read()
257+
: reader->Read(*opt_selected_read_indices));
258+
ASSERT_OK(actual_output_table->ValidateFull());
254259
AssertTablesEqual(*expected_output_table, *actual_output_table, false, false);
255260
}
256261

@@ -451,6 +456,37 @@ TEST_F(TestORCWriterTrivialNoConversion, writeChunkless) {
451456
std::shared_ptr<Table> table = TableFromJSON(table_schema, {});
452457
AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16);
453458
}
459+
TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) {
460+
std::shared_ptr<Table> table = TableFromJSON(table_schema, {R"([])"});
461+
std::shared_ptr<Schema> schema_selected =
462+
schema({field("int8", int8()), field("int32", int32())});
463+
std::shared_ptr<Table> table_selected = TableFromJSON(schema_selected, {R"([])"});
464+
std::vector<int> selected_indices = {1, 3};
465+
AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16,
466+
&selected_indices);
467+
}
468+
TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) {
469+
std::vector<int> selected_indices = {1, 7};
470+
random::RandomArrayGenerator rand(kRandomSeed);
471+
std::shared_ptr<Schema> local_schema = schema({
472+
field("bool", boolean()),
473+
field("int32", int32()),
474+
field("int64", int64()),
475+
field("float", float32()),
476+
field("struct", struct_({field("a", utf8()), field("b", int64())})),
477+
field("double", float64()),
478+
field("date32", date32()),
479+
field("ts3", timestamp(TimeUnit::NANO)),
480+
field("string", utf8()),
481+
field("binary", binary()),
482+
});
483+
auto batch = rand.BatchOf(local_schema->fields(), 100);
484+
std::shared_ptr<Table> table = Table::Make(local_schema, batch->columns());
485+
EXPECT_OK_AND_ASSIGN(auto table_selected, table->SelectColumns(selected_indices));
486+
AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize,
487+
&selected_indices);
488+
}
489+
454490
class TestORCWriterTrivialWithConversion : public ::testing::Test {
455491
public:
456492
TestORCWriterTrivialWithConversion() {

ruby/red-arrow/test/test-orc.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,8 @@ def pp_values(values)
164164
]
165165
end
166166
assert_equal([
167-
["boolean1: bool", [pp_values([false, true])]],
168-
["short1: int16", [pp_values([1024, 2048])]],
167+
["byte1: int8", [pp_values([1, 100])]],
168+
["int1: int32", [pp_values([65536, 65536])]],
169169
],
170170
dump)
171171
end

0 commit comments

Comments
 (0)