From dbff1f4a3e11d808eddf24b816046ab854d5d836 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 26 Mar 2024 23:17:12 +0800 Subject: [PATCH] GH-36026: [C++][ORC] Catch all ORC exceptions to avoid crash (#40697) ### Rationale for this change When /usr/share/zoneinfo is unavailable and TZDIR env is unset, creating C++ ORC reader will crash on Windows. We need to eagerly check this and prevent followup crash. ### What changes are included in this PR? Eagerly check TZDB availability before creating ORC reader/writer. ### Are these changes tested? Yes, added a test case to make sure the check work as expected. ### Are there any user-facing changes? Users on Windows (or other cases when TZDB is not availble) will clearly see this error message instead of crash. * GitHub Issue: #36026 Authored-by: Gang Wu Signed-off-by: Antoine Pitrou --- cpp/src/arrow/adapters/orc/adapter.cc | 58 +++++++++++++++------- cpp/src/arrow/adapters/orc/adapter_test.cc | 19 +++++++ cpp/src/arrow/adapters/orc/util.cc | 8 +++ cpp/src/arrow/adapters/orc/util.h | 3 ++ 4 files changed, 69 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 2100e701f3302..127ec49ba990f 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -18,17 +18,14 @@ #include "arrow/adapters/orc/adapter.h" #include -#include -#include +#include #include #include #include #include -#include #include #include "arrow/adapters/orc/util.h" -#include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/io/interfaces.h" #include "arrow/memory_pool.h" @@ -37,14 +34,11 @@ #include "arrow/table.h" #include "arrow/table_builder.h" #include "arrow/type.h" -#include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" -#include "arrow/util/range.h" -#include "arrow/util/visibility.h" #include "orc/Exceptions.hh" // alias to not interfere with nested orc namespace @@ -80,6 +74,12 @@ namespace liborc = orc; } \ catch (const liborc::NotImplementedYet& e) { \ return Status::NotImplemented(e.what()); \ + } \ + catch (const std::exception& e) { \ + return Status::UnknownError(e.what()); \ + } \ + catch (...) { \ + return Status::UnknownError("ORC error"); \ } #define ORC_CATCH_NOT_OK(_s) \ @@ -173,7 +173,7 @@ class OrcStripeReader : public RecordBatchReader { int64_t batch_size_; }; -liborc::RowReaderOptions default_row_reader_options() { +liborc::RowReaderOptions DefaultRowReaderOptions() { liborc::RowReaderOptions options; // Orc timestamp type is error-prone since it serializes values in the writer timezone // and reads them back in the reader timezone. To avoid this, both the Apache Orc C++ @@ -183,6 +183,24 @@ liborc::RowReaderOptions default_row_reader_options() { return options; } +// Proactively check timezone database availability for ORC versions older than 2.0.0 +Status CheckTimeZoneDatabaseAvailability() { + if (GetOrcMajorVersion() >= 2) { + return Status::OK(); + } + auto tz_dir = std::getenv("TZDIR"); + bool is_tzdb_avaiable = tz_dir != nullptr + ? std::filesystem::exists(tz_dir) + : std::filesystem::exists("/usr/share/zoneinfo"); + if (!is_tzdb_avaiable) { + return Status::Invalid( + "IANA time zone database is unavailable but required by ORC." + " Please install it to /usr/share/zoneinfo or set TZDIR env to the installed" + " directory"); + } + return Status::OK(); +} + } // namespace class ORCFileReader::Impl { @@ -332,25 +350,25 @@ class ORCFileReader::Impl { } Result> Read() { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema()); return ReadTable(opts, schema); } Result> Read(const std::shared_ptr& schema) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); return ReadTable(opts, schema); } Result> Read(const std::vector& include_indices) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadTable(opts, schema); } Result> Read(const std::vector& include_names) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectNames(&opts, include_names)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadTable(opts, schema); @@ -358,13 +376,13 @@ class ORCFileReader::Impl { Result> Read(const std::shared_ptr& schema, const std::vector& include_indices) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); return ReadTable(opts, schema); } Result> ReadStripe(int64_t stripe) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); return ReadBatch(opts, schema, stripes_[static_cast(stripe)].num_rows); @@ -372,7 +390,7 @@ class ORCFileReader::Impl { Result> ReadStripe( int64_t stripe, const std::vector& include_indices) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectIndices(&opts, include_indices)); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); @@ -381,7 +399,7 @@ class ORCFileReader::Impl { Result> ReadStripe( int64_t stripe, const std::vector& include_names) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); RETURN_NOT_OK(SelectNames(&opts, include_names)); RETURN_NOT_OK(SelectStripe(&opts, stripe)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); @@ -487,7 +505,7 @@ class ORCFileReader::Impl { return nullptr; } - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); if (!include_indices.empty()) { RETURN_NOT_OK(SelectIndices(&opts, include_indices)); } @@ -508,7 +526,7 @@ class ORCFileReader::Impl { Result> GetRecordBatchReader( int64_t batch_size, const std::vector& include_names) { - liborc::RowReaderOptions opts = default_row_reader_options(); + liborc::RowReaderOptions opts = DefaultRowReaderOptions(); if (!include_names.empty()) { RETURN_NOT_OK(SelectNames(&opts, include_names)); } @@ -541,6 +559,7 @@ ORCFileReader::~ORCFileReader() {} Result> ORCFileReader::Open( const std::shared_ptr& file, MemoryPool* pool) { + RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); auto result = std::unique_ptr(new ORCFileReader()); RETURN_NOT_OK(result->impl_->Open(file, pool)); return std::move(result); @@ -779,7 +798,7 @@ class ORCFileWriter::Impl { &(arrow_index_offset[i]), (root->fields)[i])); } root->numElements = (root->fields)[0]->numElements; - writer_->add(*batch); + ORC_CATCH_NOT_OK(writer_->add(*batch)); batch->clear(); num_rows -= batch_size; } @@ -807,6 +826,7 @@ ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } Result> ORCFileWriter::Open( io::OutputStream* output_stream, const WriteOptions& writer_options) { + RETURN_NOT_OK(CheckTimeZoneDatabaseAvailability()); std::unique_ptr result = std::unique_ptr(new ORCFileWriter()); Status status = result->impl_->Open(output_stream, writer_options); diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 73ecde6b9b576..b9d6c53215b41 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -33,8 +33,10 @@ #include "arrow/status.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/random.h" #include "arrow/type.h" +#include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" namespace liborc = orc; @@ -636,6 +638,23 @@ TEST(TestAdapterReadWrite, FieldAttributesRoundTrip) { AssertSchemaEqual(schema, read_schema, /*check_metadata=*/true); } +TEST(TestAdapterReadWrite, ThrowWhenTZDBUnavaiable) { + if (adapters::orc::GetOrcMajorVersion() >= 2) { + GTEST_SKIP() << "Only ORC pre-2.0.0 versions have the time zone database check"; + } + + EnvVarGuard tzdir_guard("TZDIR", "/wrong/path"); + const char* expect_str = "IANA time zone database is unavailable but required by ORC"; + EXPECT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create(1024)); + EXPECT_THAT( + adapters::orc::ORCFileWriter::Open(out_stream.get(), adapters::orc::WriteOptions()), + Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); + EXPECT_OK_AND_ASSIGN(auto buffer, out_stream->Finish()); + EXPECT_THAT(adapters::orc::ORCFileReader::Open( + std::make_shared(buffer), default_memory_pool()), + Raises(StatusCode::Invalid, testing::HasSubstr(expect_str))); +} + // Trivial class TestORCWriterTrivialNoWrite : public ::testing::Test {}; diff --git a/cpp/src/arrow/adapters/orc/util.cc b/cpp/src/arrow/adapters/orc/util.cc index f4bdbae6a7b4a..2a74bec1aa6fd 100644 --- a/cpp/src/arrow/adapters/orc/util.cc +++ b/cpp/src/arrow/adapters/orc/util.cc @@ -37,6 +37,7 @@ #include "orc/MemoryPool.hh" #include "orc/OrcFile.hh" +#include "orc/orc-config.hh" // alias to not interfere with nested orc namespace namespace liborc = orc; @@ -1220,6 +1221,13 @@ Result> GetArrowField(const std::string& name, return field(name, std::move(arrow_type), nullable, std::move(metadata)); } +int GetOrcMajorVersion() { + std::stringstream orc_version(ORC_VERSION); + std::string major_version; + std::getline(orc_version, major_version, '.'); + return std::stoi(major_version); +} + } // namespace orc } // namespace adapters } // namespace arrow diff --git a/cpp/src/arrow/adapters/orc/util.h b/cpp/src/arrow/adapters/orc/util.h index 00af9f4b76e67..a18b11dda013f 100644 --- a/cpp/src/arrow/adapters/orc/util.h +++ b/cpp/src/arrow/adapters/orc/util.h @@ -60,6 +60,9 @@ ARROW_EXPORT Status WriteBatch(const ChunkedArray& chunked_array, int64_t length int* arrow_chunk_offset, int64_t* arrow_index_offset, liborc::ColumnVectorBatch* column_vector_batch); +/// \brief Get the major version provided by the official ORC C++ library. +ARROW_EXPORT int GetOrcMajorVersion(); + } // namespace orc } // namespace adapters } // namespace arrow