Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-36026: [C++][ORC] Catch all ORC exceptions to avoid crash #40697

Merged
merged 6 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 2 additions & 20 deletions cpp/src/arrow/adapters/orc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ namespace liborc = orc;
return Status::NotImplemented(e.what()); \
} \
catch (const std::exception& e) { \
wgtmac marked this conversation as resolved.
Show resolved Hide resolved
return Status::Invalid(e.what()); \
return Status::UnknownError(e.what()); \
} \
catch (...) { \
return Status::UnknownError("ORC error"); \
Expand Down Expand Up @@ -183,22 +183,6 @@ liborc::RowReaderOptions default_row_reader_options() {
return options;
}

// Proactively check the availability of timezone database.
// Remove it once https://issues.apache.org/jira/browse/ORC-1661 has been fixed.
Status check_timezone_database_availability() {
auto tz_dir = std::getenv("TZDIR");
bool is_tzdb_avaiable = tz_dir != nullptr
wgtmac marked this conversation as resolved.
Show resolved Hide resolved
? std::filesystem::exists(tz_dir)
: std::filesystem::exists("/usr/share/zoneinfo");
if (!is_tzdb_avaiable) {
return Status::Invalid(
"IANA timezone database is unavailable but required by ORC."
" Please install it to /usr/share/zoneinfo or set TZDIR env to the installed"
" directory");
}
return Status::OK();
}

} // namespace

class ORCFileReader::Impl {
Expand Down Expand Up @@ -557,7 +541,6 @@ ORCFileReader::~ORCFileReader() {}

Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open(
const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool) {
RETURN_NOT_OK(check_timezone_database_availability());
auto result = std::unique_ptr<ORCFileReader>(new ORCFileReader());
RETURN_NOT_OK(result->impl_->Open(file, pool));
return std::move(result);
Expand Down Expand Up @@ -796,7 +779,7 @@ class ORCFileWriter::Impl {
&(arrow_index_offset[i]), (root->fields)[i]));
}
root->numElements = (root->fields)[0]->numElements;
writer_->add(*batch);
ORC_CATCH_NOT_OK(writer_->add(*batch));
batch->clear();
num_rows -= batch_size;
}
Expand Down Expand Up @@ -824,7 +807,6 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); }

Result<std::unique_ptr<ORCFileWriter>> ORCFileWriter::Open(
io::OutputStream* output_stream, const WriteOptions& writer_options) {
RETURN_NOT_OK(check_timezone_database_availability());
std::unique_ptr<ORCFileWriter> result =
std::unique_ptr<ORCFileWriter>(new ORCFileWriter());
Status status = result->impl_->Open(output_stream, writer_options);
Expand Down
35 changes: 22 additions & 13 deletions cpp/src/arrow/adapters/orc/adapter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -638,28 +638,37 @@ TEST(TestAdapterReadWrite, FieldAttributesRoundTrip) {
AssertSchemaEqual(schema, read_schema, /*check_metadata=*/true);
}

TEST(TestAdapterReadWrite, ThrowWhenTZDBUnavaiable) {
// Backup the original TZDIR env and set a wrong value by purpose to trigger the check.
TEST(TestAdapterReadWrite, catchTimezoneError) {
// Write a simple ORC file with timestamp type.
auto table_schema = schema({field("a", timestamp(TimeUnit::NANO))});
const auto table = GenerateRandomTable(table_schema, /*size=1*/ 1, /*min_num_chunks=*/1,
/*max_num_chunks=*/1, /*null_probability=*/0);
EXPECT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(4096));
EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open(out_stream.get()));
ARROW_EXPECT_OK(writer->Write(*table));
ARROW_EXPECT_OK(writer->Close());
EXPECT_OK_AND_ASSIGN(auto buffer, out_stream->Finish());

// Backup the original TZDIR env and set a wrong value by purpose.
const char* tzdir_env_key = "TZDIR";
const char* expect_str = "IANA timezone database is unavailable but required by ORC";
auto tzdir_env_backup = std::getenv(tzdir_env_key);
ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, "/a/b/c/d/e"));
ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, "/invalid_path"));

EXPECT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024));
EXPECT_THAT(
adapters::orc::ORCFileWriter::Open(out_stream.get(), adapters::orc::WriteOptions()),
Raises(StatusCode::Invalid, testing::HasSubstr(expect_str)));
EXPECT_OK_AND_ASSIGN(auto reader, adapters::orc::ORCFileReader::Open(
std::make_shared<io::BufferReader>(buffer),
default_memory_pool()));

EXPECT_OK_AND_ASSIGN(auto buffer, out_stream->Finish());
EXPECT_THAT(adapters::orc::ORCFileReader::Open(
std::make_shared<io::BufferReader>(buffer), default_memory_pool()),
Raises(StatusCode::Invalid, testing::HasSubstr(expect_str)));
// Verify that we have caught the orc::TimezoneError exception.
EXPECT_THAT(reader->Read(),
Raises(StatusCode::UnknownError, testing::HasSubstr("/invalid_path")));

// Restore TZDIR env.
// Restore TZDIR env to verify timestamp values can be read successfully.
ARROW_EXPECT_OK(arrow::internal::DelEnvVar(tzdir_env_key));
if (tzdir_env_backup) {
ARROW_EXPECT_OK(arrow::internal::SetEnvVar(tzdir_env_key, tzdir_env_backup));
}
EXPECT_OK_AND_ASSIGN(auto read_table, reader->Read());
EXPECT_TRUE(read_table->Equals(*table));
}

// Trivial
Expand Down
Loading