From b54664c7b218ac3526551ce5cd57348453c8da4d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 15 Feb 2020 14:14:07 +0900 Subject: [PATCH 1/3] [C++][Python][CI] Ensure running HDFS related tests libhdfs3 backend may be broken. It causes SEGV. So libhdfs3 backend test is disabled for now. --- .env | 1 + ci/docker/conda-python-hdfs.dockerfile | 6 ++-- ci/scripts/integration_hdfs.sh | 41 ++++++++++++++++++++++++-- cpp/src/arrow/io/hdfs_test.cc | 15 ++++++++-- docker-compose.yml | 1 + python/pyarrow/tests/test_hdfs.py | 12 ++++++-- 6 files changed, 67 insertions(+), 9 deletions(-) diff --git a/.env b/.env index 46c57db29f125..db3a95832c1c1 100644 --- a/.env +++ b/.env @@ -35,6 +35,7 @@ PANDAS=latest DASK=latest TURBODBC=latest HDFS=2.9.2 +LIBHDFS3=2.3 SPARK=master DOTNET=2.1 R=3.6 diff --git a/ci/docker/conda-python-hdfs.dockerfile b/ci/docker/conda-python-hdfs.dockerfile index 3f45ee9d80c28..89f242d5de1ca 100644 --- a/ci/docker/conda-python-hdfs.dockerfile +++ b/ci/docker/conda-python-hdfs.dockerfile @@ -21,11 +21,13 @@ ARG python=3.6 FROM ${repo}:${arch}-conda-python-${python} ARG jdk=8 +ARG libhdfs3=2.3 ARG maven=3.5 RUN conda install -q \ - pandas \ + libhdfs3=${libhdfs3} \ + maven=${maven} \ openjdk=${jdk} \ - maven=${maven} && \ + pandas && \ conda clean --all # installing libhdfs (JNI) diff --git a/ci/scripts/integration_hdfs.sh b/ci/scripts/integration_hdfs.sh index 9408425d0d19f..71be2051c2821 100755 --- a/ci/scripts/integration_hdfs.sh +++ b/ci/scripts/integration_hdfs.sh @@ -22,20 +22,57 @@ set -e source_dir=${1}/cpp build_dir=${2}/cpp -export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` +export CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath --glob) export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop export LIBHDFS3_CONF=$HADOOP_CONF_DIR/hdfs-site.xml -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/ +export ARROW_LIBHDFS3_DIR=$CONDA_PREFIX/lib + +libhdfs_dir=$HADOOP_HOME/lib/native +hadoop_home=$HADOOP_HOME + +function use_hadoop_home() { + unset ARROW_LIBHDFS_DIR + export HADOOP_HOME=$hadoop_home +} + +function use_libhdfs_dir() { + unset HADOOP_HOME + export ARROW_LIBHDFS_DIR=$libhdfs_dir +} + +# TODO: libhdfs3 backend may be broken +if [ -f $CONDA_PREFIX/lib/libhdfs3.so ]; then + mv $CONDA_PREFIX/lib/libhdfs3.so{,.disabled} +fi # execute cpp tests +export ARROW_HDFS_TEST_LIBHDFS_REQUIRE=ON +# TODO: libhdfs3 backend may be broken +# export ARROW_HDFS_TEST_LIBHDFS3_REQUIRE=ON pushd ${build_dir} + debug/arrow-io-hdfs-test debug/arrow-hdfs-test + +use_libhdfs_dir +debug/arrow-io-hdfs-test +debug/arrow-hdfs-test +use_hadoop_home + popd # cannot use --pyargs with custom arguments like --hdfs or --only-hdfs, because # pytest ignores them, see https://github.com/pytest-dev/pytest/issues/3517 export PYARROW_TEST_HDFS=ON +export PYARROW_HDFS_TEST_LIBHDFS_REQUIRE=ON +# TODO: libhdfs3 backend may be broken +# export PYARROW_HDFS_TEST_LIBHDFS3_REQUIRE=ON + +pytest -v --pyargs pyarrow.tests.test_fs +pytest -v --pyargs pyarrow.tests.test_hdfs + +use_libhdfs_dir pytest -v --pyargs pyarrow.tests.test_fs pytest -v --pyargs pyarrow.tests.test_hdfs +use_hadoop_home diff --git a/cpp/src/arrow/io/hdfs_test.cc b/cpp/src/arrow/io/hdfs_test.cc index f3613f1ae3a1a..76393c473e5fd 100644 --- a/cpp/src/arrow/io/hdfs_test.cc +++ b/cpp/src/arrow/io/hdfs_test.cc @@ -107,14 +107,23 @@ class TestHadoopFileSystem : public ::testing::Test { if (DRIVER::type == HdfsDriver::LIBHDFS) { msg = ConnectLibHdfs(&driver_shim); if (!msg.ok()) { - std::cout << "Loading libhdfs failed, skipping tests 
gracefully" << std::endl; + if (std::getenv("ARROW_HDFS_TEST_LIBHDFS_REQUIRE")) { + FAIL() << "Loading libhdfs failed: " << msg.ToString(); + } else { + std::cout << "Loading libhdfs failed, skipping tests gracefully: " + << msg.ToString() << std::endl; + } return; } } else { msg = ConnectLibHdfs3(&driver_shim); if (!msg.ok()) { - std::cout << "Loading libhdfs3 failed, skipping tests gracefully. " - << msg.ToString() << std::endl; + if (std::getenv("ARROW_HDFS_TEST_LIBHDFS3_REQUIRE")) { + FAIL() << "Loading libhdfs3 failed: " << msg.ToString(); + } else { + std::cout << "Loading libhdfs3 failed, skipping tests gracefully: " + << msg.ToString() << std::endl; + } return; } } diff --git a/docker-compose.yml b/docker-compose.yml index 4b0c3391d8345..f70f22bb92517 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1098,6 +1098,7 @@ services: # be set to ${MAVEN} maven: 3.5 hdfs: ${HDFS} + libhdfs3: ${LIBHDFS3} links: - impala:impala environment: diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py index f64f0fefcd6bd..ad1dea8200200 100644 --- a/python/pyarrow/tests/test_hdfs.py +++ b/python/pyarrow/tests/test_hdfs.py @@ -384,7 +384,11 @@ class TestLibHdfs(HdfsTestCases, unittest.TestCase): @classmethod def check_driver(cls): if not pa.have_libhdfs(): - pytest.skip('No libhdfs available on system') + message = 'No libhdfs available on system' + if os.environ.get('PYARROW_HDFS_TEST_LIBHDFS_REQUIRE'): + pytest.fail(message) + else: + pytest.skip(message) def test_orphaned_file(self): hdfs = hdfs_test_client() @@ -403,7 +407,11 @@ class TestLibHdfs3(HdfsTestCases, unittest.TestCase): @classmethod def check_driver(cls): if not pa.have_libhdfs3(): - pytest.skip('No libhdfs3 available on system') + message = 'No libhdfs3 available on system' + if os.environ.get('PYARROW_HDFS_TEST_LIBHDFS3_REQUIRE'): + pytest.fail(message) + else: + pytest.skip(message) def _get_hdfs_uri(path): From b311877ca8d381c0e1a1c4798fbb6459891a83f9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 19 Feb 2020 05:37:54 +0900 Subject: [PATCH 2/3] Remove libhdfs3 support It's no longer maintained. 
--- .env | 1 - ci/docker/conda-python-hdfs.dockerfile | 2 - ci/scripts/integration_hdfs.sh | 9 --- cpp/src/arrow/filesystem/hdfs.cc | 42 +------------- cpp/src/arrow/filesystem/hdfs.h | 2 - cpp/src/arrow/filesystem/hdfs_test.cc | 53 +++-------------- cpp/src/arrow/io/hdfs.cc | 11 +--- cpp/src/arrow/io/hdfs.h | 6 -- cpp/src/arrow/io/hdfs_internal.cc | 52 ----------------- cpp/src/arrow/io/hdfs_internal.h | 1 - cpp/src/arrow/io/hdfs_test.cc | 80 +++++++------------------- docker-compose.yml | 1 - python/pyarrow/__init__.py | 2 +- python/pyarrow/_hdfs.pyx | 24 +------- python/pyarrow/hdfs.py | 12 ++-- python/pyarrow/io-hdfs.pxi | 25 +------- python/pyarrow/tests/test_fs.py | 6 -- python/pyarrow/tests/test_hdfs.py | 32 ++--------- 18 files changed, 48 insertions(+), 313 deletions(-) diff --git a/.env b/.env index db3a95832c1c1..46c57db29f125 100644 --- a/.env +++ b/.env @@ -35,7 +35,6 @@ PANDAS=latest DASK=latest TURBODBC=latest HDFS=2.9.2 -LIBHDFS3=2.3 SPARK=master DOTNET=2.1 R=3.6 diff --git a/ci/docker/conda-python-hdfs.dockerfile b/ci/docker/conda-python-hdfs.dockerfile index 89f242d5de1ca..7c3a0a02661fb 100644 --- a/ci/docker/conda-python-hdfs.dockerfile +++ b/ci/docker/conda-python-hdfs.dockerfile @@ -21,10 +21,8 @@ ARG python=3.6 FROM ${repo}:${arch}-conda-python-${python} ARG jdk=8 -ARG libhdfs3=2.3 ARG maven=3.5 RUN conda install -q \ - libhdfs3=${libhdfs3} \ maven=${maven} \ openjdk=${jdk} \ pandas && \ diff --git a/ci/scripts/integration_hdfs.sh b/ci/scripts/integration_hdfs.sh index 71be2051c2821..fab879fc11fed 100755 --- a/ci/scripts/integration_hdfs.sh +++ b/ci/scripts/integration_hdfs.sh @@ -40,15 +40,8 @@ function use_libhdfs_dir() { export ARROW_LIBHDFS_DIR=$libhdfs_dir } -# TODO: libhdfs3 backend may be broken -if [ -f $CONDA_PREFIX/lib/libhdfs3.so ]; then - mv $CONDA_PREFIX/lib/libhdfs3.so{,.disabled} -fi - # execute cpp tests export ARROW_HDFS_TEST_LIBHDFS_REQUIRE=ON -# TODO: libhdfs3 backend may be broken -# export ARROW_HDFS_TEST_LIBHDFS3_REQUIRE=ON pushd ${build_dir} debug/arrow-io-hdfs-test @@ -66,8 +59,6 @@ popd export PYARROW_TEST_HDFS=ON export PYARROW_HDFS_TEST_LIBHDFS_REQUIRE=ON -# TODO: libhdfs3 backend may be broken -# export PYARROW_HDFS_TEST_LIBHDFS3_REQUIRE=ON pytest -v --pyargs pyarrow.tests.test_fs pytest -v --pyargs pyarrow.tests.test_hdfs diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index 273095abdc552..316a9ae0af524 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -59,11 +59,7 @@ class HadoopFileSystem::Impl { Status Init() { io::internal::LibHdfsShim* driver_shim; - if (options_.connection_config.driver == io::HdfsDriver::LIBHDFS3) { - RETURN_NOT_OK(ConnectLibHdfs3(&driver_shim)); - } else { - RETURN_NOT_OK(ConnectLibHdfs(&driver_shim)); - } + RETURN_NOT_OK(ConnectLibHdfs(&driver_shim)); RETURN_NOT_OK(io::HadoopFileSystem::Connect(&options_.connection_config, &client_)); return Status::OK(); } @@ -266,14 +262,6 @@ void HdfsOptions::ConfigureEndPoint(const std::string& host, int port) { connection_config.port = port; } -void HdfsOptions::ConfigureHdfsDriver(bool use_hdfs3) { - if (use_hdfs3) { - connection_config.driver = ::arrow::io::HdfsDriver::LIBHDFS3; - } else { - connection_config.driver = ::arrow::io::HdfsDriver::LIBHDFS; - } -} - void HdfsOptions::ConfigureHdfsUser(const std::string& user_name) { connection_config.user = user_name; } @@ -299,32 +287,8 @@ Result HdfsOptions::FromUri(const Uri& uri) { options_map.emplace(kv.first, kv.second); } - auto useHdfs3 = false; - auto 
it = options_map.find("use_hdfs3"); - if (it != options_map.end()) { - const auto& v = it->second; - if (v == "1") { - options.ConfigureHdfsDriver(true); - useHdfs3 = true; - } else if (v == "0") { - options.ConfigureHdfsDriver(false); - } else { - return Status::Invalid( - "Invalid value for option 'use_hdfs3' (allowed values are '0' and '1'): '", v, - "'"); - } - } - std::string host; - if (useHdfs3) { - if (uri.scheme() == "viewfs") { - ARROW_LOG(WARNING) - << "viewfs://namenode resolves to hdfs://namenode with hdfs3 driver."; - } - host = uri.host(); - } else { - host = uri.scheme() + "://" + uri.host(); - } + host = uri.scheme() + "://" + uri.host(); const auto port = uri.port(); if (port == -1) { @@ -334,7 +298,7 @@ Result HdfsOptions::FromUri(const Uri& uri) { options.ConfigureEndPoint(host, port); } - it = options_map.find("replication"); + auto it = options_map.find("replication"); if (it != options_map.end()) { const auto& v = it->second; ::arrow::internal::StringConverter converter; diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h index f96d9fdb18c5b..96fc03654bd63 100644 --- a/cpp/src/arrow/filesystem/hdfs.h +++ b/cpp/src/arrow/filesystem/hdfs.h @@ -42,8 +42,6 @@ struct ARROW_EXPORT HdfsOptions { int64_t default_block_size = 0; void ConfigureEndPoint(const std::string& host, int port); - /// Be cautious that libhdfs3 is a unmaintained project - void ConfigureHdfsDriver(bool use_hdfs3); void ConfigureHdfsReplication(int16_t replication); void ConfigureHdfsUser(const std::string& user_name); void ConfigureHdfsBufferSize(int32_t buffer_size); diff --git a/cpp/src/arrow/filesystem/hdfs_test.cc b/cpp/src/arrow/filesystem/hdfs_test.cc index a0bc9537d7477..701f62377e387 100644 --- a/cpp/src/arrow/filesystem/hdfs_test.cc +++ b/cpp/src/arrow/filesystem/hdfs_test.cc @@ -30,7 +30,6 @@ namespace arrow { using internal::Uri; -using io::HdfsDriver; namespace fs { @@ -44,41 +43,21 @@ TEST(TestHdfsOptions, FromUri) { ASSERT_EQ(options.connection_config.host, "hdfs://localhost"); ASSERT_EQ(options.connection_config.port, 0); ASSERT_EQ(options.connection_config.user, ""); - ASSERT_EQ(options.connection_config.driver, HdfsDriver::LIBHDFS); - ASSERT_OK(uri.Parse("hdfs://otherhost:9999/?use_hdfs3=0&replication=2")); + ASSERT_OK(uri.Parse("hdfs://otherhost:9999/?replication=2")); ASSERT_OK_AND_ASSIGN(options, HdfsOptions::FromUri(uri)); ASSERT_EQ(options.replication, 2); ASSERT_EQ(options.connection_config.host, "hdfs://otherhost"); ASSERT_EQ(options.connection_config.port, 9999); ASSERT_EQ(options.connection_config.user, ""); - ASSERT_EQ(options.connection_config.driver, HdfsDriver::LIBHDFS); - - ASSERT_OK(uri.Parse("hdfs://otherhost:9999/?use_hdfs3=1&user=stevereich")); - ASSERT_OK_AND_ASSIGN(options, HdfsOptions::FromUri(uri)); - ASSERT_EQ(options.replication, 3); - ASSERT_EQ(options.connection_config.host, "otherhost"); - ASSERT_EQ(options.connection_config.port, 9999); - ASSERT_EQ(options.connection_config.user, "stevereich"); - ASSERT_EQ(options.connection_config.driver, HdfsDriver::LIBHDFS3); ASSERT_OK(uri.Parse("viewfs://other-nn/mypath/myfile")); ASSERT_OK_AND_ASSIGN(options, HdfsOptions::FromUri(uri)); ASSERT_EQ(options.connection_config.host, "viewfs://other-nn"); ASSERT_EQ(options.connection_config.port, 0); ASSERT_EQ(options.connection_config.user, ""); - ASSERT_EQ(options.connection_config.driver, HdfsDriver::LIBHDFS); } -struct JNIDriver { - static HdfsDriver type; -}; - -struct PivotalDriver { - static HdfsDriver type; -}; - -template class 
TestHadoopFileSystem : public ::testing::Test { public: void SetUp() override { @@ -90,15 +69,8 @@ class TestHadoopFileSystem : public ::testing::Test { int hdfs_port = port == nullptr ? 20500 : atoi(port); std::string hdfs_user = user == nullptr ? "root" : std::string(user); - if (DRIVER::type == HdfsDriver::LIBHDFS) { - use_hdfs3_ = false; - } else { - use_hdfs3_ = true; - } - options_.ConfigureEndPoint(hdfs_host, hdfs_port); options_.ConfigureHdfsUser(hdfs_user); - options_.ConfigureHdfsDriver(use_hdfs3_); options_.ConfigureHdfsReplication(0); auto result = HadoopFileSystem::Make(options_); @@ -207,20 +179,13 @@ class TestHadoopFileSystem : public ::testing::Test { bool loaded_driver_ = false; }; -HdfsDriver JNIDriver::type = HdfsDriver::LIBHDFS; -HdfsDriver PivotalDriver::type = HdfsDriver::LIBHDFS3; - -typedef ::testing::Types DriverTypes; - -TYPED_TEST_CASE(TestHadoopFileSystem, DriverTypes); - #define SKIP_IF_NO_DRIVER() \ if (!this->loaded_driver_) { \ ARROW_LOG(INFO) << "Driver not loaded, skipping"; \ return; \ } -TYPED_TEST(TestHadoopFileSystem, CreateDirDeleteDir) { +TEST_F(TestHadoopFileSystem, CreateDirDeleteDir) { SKIP_IF_NO_DRIVER(); // recursive = true @@ -243,7 +208,7 @@ TYPED_TEST(TestHadoopFileSystem, CreateDirDeleteDir) { ASSERT_RAISES(IOError, this->fs_->DeleteDir("AB")); } -TYPED_TEST(TestHadoopFileSystem, DeleteDirContents) { +TEST_F(TestHadoopFileSystem, DeleteDirContents) { SKIP_IF_NO_DRIVER(); ASSERT_OK(this->fs_->CreateDir("AB/CD")); @@ -262,7 +227,7 @@ TYPED_TEST(TestHadoopFileSystem, DeleteDirContents) { ASSERT_OK(this->fs_->DeleteDir("AB")); } -TYPED_TEST(TestHadoopFileSystem, WriteReadFile) { +TEST_F(TestHadoopFileSystem, WriteReadFile) { SKIP_IF_NO_DRIVER(); ASSERT_OK(this->fs_->CreateDir("CD")); @@ -285,21 +250,21 @@ TYPED_TEST(TestHadoopFileSystem, WriteReadFile) { ASSERT_OK(this->fs_->DeleteDir("CD")); } -TYPED_TEST(TestHadoopFileSystem, GetTargetStatsRelative) { +TEST_F(TestHadoopFileSystem, GetTargetStatsRelative) { // Test GetTargetStats() with relative paths SKIP_IF_NO_DRIVER(); this->TestGetTargetStats(""); } -TYPED_TEST(TestHadoopFileSystem, GetTargetStatsAbsolute) { +TEST_F(TestHadoopFileSystem, GetTargetStatsAbsolute) { // Test GetTargetStats() with absolute paths SKIP_IF_NO_DRIVER(); this->TestGetTargetStats("/"); } -TYPED_TEST(TestHadoopFileSystem, RelativeVsAbsolutePaths) { +TEST_F(TestHadoopFileSystem, RelativeVsAbsolutePaths) { SKIP_IF_NO_DRIVER(); // XXX This test assumes the current working directory is not "/" @@ -313,7 +278,7 @@ TYPED_TEST(TestHadoopFileSystem, RelativeVsAbsolutePaths) { AssertFileStats(this->fs_.get(), "CD", FileType::NonExistent); } -TYPED_TEST(TestHadoopFileSystem, MoveDir) { +TEST_F(TestHadoopFileSystem, MoveDir) { SKIP_IF_NO_DRIVER(); FileStats stat; @@ -333,7 +298,7 @@ TYPED_TEST(TestHadoopFileSystem, MoveDir) { ASSERT_OK(this->fs_->DeleteDir(directory_name_dest)); } -TYPED_TEST(TestHadoopFileSystem, FileSystemFromUri) { +TEST_F(TestHadoopFileSystem, FileSystemFromUri) { SKIP_IF_NO_DRIVER(); this->TestFileSystemFromUri(); diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index e6b90a9c81698..e18553115bb7a 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -334,11 +334,7 @@ class HadoopFileSystem::HadoopFileSystemImpl { HadoopFileSystemImpl() : driver_(NULLPTR), port_(0), fs_(NULLPTR) {} Status Connect(const HdfsConnectionConfig* config) { - if (config->driver == HdfsDriver::LIBHDFS3) { - RETURN_NOT_OK(ConnectLibHdfs3(&driver_)); - } else { - 
RETURN_NOT_OK(ConnectLibHdfs(&driver_)); - } + RETURN_NOT_OK(ConnectLibHdfs(&driver_)); // connect to HDFS with the builder object hdfsBuilder* builder = driver_->NewBuilder(); @@ -674,10 +670,5 @@ Status HaveLibHdfs() { return internal::ConnectLibHdfs(&driver); } -Status HaveLibHdfs3() { - internal::LibHdfsShim* driver; - return internal::ConnectLibHdfs3(&driver); -} - } // namespace io } // namespace arrow diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index 2ec2a9551be44..d0e769f132207 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -57,17 +57,12 @@ struct HdfsPathInfo { int16_t permissions; }; -enum class HdfsDriver : char { LIBHDFS, LIBHDFS3 }; - struct HdfsConnectionConfig { std::string host; int port; std::string user; std::string kerb_ticket; std::unordered_map extra_conf; - HdfsDriver driver; - - HdfsConnectionConfig() : driver(HdfsDriver::LIBHDFS) {} }; class ARROW_EXPORT HadoopFileSystem : public FileSystem { @@ -258,7 +253,6 @@ class ARROW_EXPORT HdfsOutputStream : public OutputStream { }; Status ARROW_EXPORT HaveLibHdfs(); -Status ARROW_EXPORT HaveLibHdfs3(); } // namespace io } // namespace arrow diff --git a/cpp/src/arrow/io/hdfs_internal.cc b/cpp/src/arrow/io/hdfs_internal.cc index ffd23bb32aa27..ced298f732130 100644 --- a/cpp/src/arrow/io/hdfs_internal.cc +++ b/cpp/src/arrow/io/hdfs_internal.cc @@ -96,7 +96,6 @@ LibraryHandle libjvm_handle = nullptr; // Helper functions for dlopens Result> get_potential_libjvm_paths(); Result> get_potential_libhdfs_paths(); -Result> get_potential_libhdfs3_paths(); Result try_dlopen(const std::vector& potential_paths, const char* name); @@ -166,34 +165,6 @@ Result> get_potential_libhdfs_paths() { return potential_paths; } -Result> get_potential_libhdfs3_paths() { - std::vector potential_paths; - std::string file_name; - -// OS-specific file name -#ifdef _WIN32 - file_name = "hdfs3.dll"; -#elif __APPLE__ - file_name = "libhdfs3.dylib"; -#else - file_name = "libhdfs3.so"; -#endif - - // Common paths - ARROW_ASSIGN_OR_RAISE(auto search_paths, MakeFilenameVector({"", "."})); - - // Path from environment variable - AppendEnvVarFilename("ARROW_LIBHDFS3_DIR", &search_paths); - - // All paths with file name - for (auto& path : search_paths) { - ARROW_ASSIGN_OR_RAISE(auto full_path, path.Join(file_name)); - potential_paths.push_back(std::move(full_path)); - } - - return potential_paths; -} - Result> get_potential_libjvm_paths() { std::vector potential_paths; @@ -304,7 +275,6 @@ Result try_dlopen(const std::vector& potential_ #endif // _WIN32 LibHdfsShim libhdfs_shim; -LibHdfsShim libhdfs3_shim; } // namespace @@ -366,28 +336,6 @@ Status ConnectLibHdfs(LibHdfsShim** driver) { return shim->GetRequiredSymbols(); } -Status ConnectLibHdfs3(LibHdfsShim** driver) { - static std::mutex lock; - std::lock_guard guard(lock); - - LibHdfsShim* shim = &libhdfs3_shim; - - static bool shim_attempted = false; - if (!shim_attempted) { - shim_attempted = true; - - shim->Initialize(); - - ARROW_ASSIGN_OR_RAISE(auto libhdfs3_potential_paths, get_potential_libhdfs3_paths()); - ARROW_ASSIGN_OR_RAISE(shim->handle, try_dlopen(libhdfs3_potential_paths, "libhdfs3")); - } else if (shim->handle == nullptr) { - return Status::IOError("Prior attempt to load libhdfs3 failed"); - } - - *driver = shim; - return shim->GetRequiredSymbols(); -} - /////////////////////////////////////////////////////////////////////////// // HDFS thin wrapper methods diff --git a/cpp/src/arrow/io/hdfs_internal.h b/cpp/src/arrow/io/hdfs_internal.h index 
2fd8ab85160c3..cf34fde855a7e 100644 --- a/cpp/src/arrow/io/hdfs_internal.h +++ b/cpp/src/arrow/io/hdfs_internal.h @@ -217,7 +217,6 @@ struct LibHdfsShim { // TODO(wesm): Remove these exports when we are linking statically Status ARROW_EXPORT ConnectLibHdfs(LibHdfsShim** driver); -Status ARROW_EXPORT ConnectLibHdfs3(LibHdfsShim** driver); } // namespace internal } // namespace io diff --git a/cpp/src/arrow/io/hdfs_test.cc b/cpp/src/arrow/io/hdfs_test.cc index 76393c473e5fd..0998f4e8c185c 100644 --- a/cpp/src/arrow/io/hdfs_test.cc +++ b/cpp/src/arrow/io/hdfs_test.cc @@ -47,15 +47,6 @@ std::vector RandomData(int64_t size) { return buffer; } -struct JNIDriver { - static HdfsDriver type; -}; - -struct PivotalDriver { - static HdfsDriver type; -}; - -template class TestHadoopFileSystem : public ::testing::Test { public: Status MakeScratchDir() { @@ -102,30 +93,15 @@ class TestHadoopFileSystem : public ::testing::Test { loaded_driver_ = false; - Status msg; - - if (DRIVER::type == HdfsDriver::LIBHDFS) { - msg = ConnectLibHdfs(&driver_shim); - if (!msg.ok()) { - if (std::getenv("ARROW_HDFS_TEST_LIBHDFS_REQUIRE")) { - FAIL() << "Loading libhdfs failed: " << msg.ToString(); - } else { - std::cout << "Loading libhdfs failed, skipping tests gracefully: " - << msg.ToString() << std::endl; - } - return; - } - } else { - msg = ConnectLibHdfs3(&driver_shim); - if (!msg.ok()) { - if (std::getenv("ARROW_HDFS_TEST_LIBHDFS3_REQUIRE")) { - FAIL() << "Loading libhdfs3 failed: " << msg.ToString(); - } else { - std::cout << "Loading libhdfs3 failed, skipping tests gracefully: " - << msg.ToString() << std::endl; - } - return; + Status msg = ConnectLibHdfs(&driver_shim); + if (!msg.ok()) { + if (std::getenv("ARROW_HDFS_TEST_LIBHDFS_REQUIRE")) { + FAIL() << "Loading libhdfs failed: " << msg.ToString(); + } else { + std::cout << "Loading libhdfs failed, skipping tests gracefully: " + << msg.ToString() << std::endl; } + return; } loaded_driver_ = true; @@ -139,7 +115,6 @@ class TestHadoopFileSystem : public ::testing::Test { conf_.host = host == nullptr ? "localhost" : host; conf_.user = user; conf_.port = port == nullptr ? 
20500 : atoi(port); - conf_.driver = DRIVER::type; ASSERT_OK(HadoopFileSystem::Connect(&conf_, &client_)); } @@ -161,26 +136,13 @@ class TestHadoopFileSystem : public ::testing::Test { std::shared_ptr client_; }; -template <> -std::string TestHadoopFileSystem::HdfsAbsPath(const std::string& relpath) { - std::stringstream ss; - ss << relpath; - return ss.str(); -} - #define SKIP_IF_NO_DRIVER() \ if (!this->loaded_driver_) { \ std::cout << "Driver not loaded, skipping" << std::endl; \ return; \ } -HdfsDriver JNIDriver::type = HdfsDriver::LIBHDFS; -HdfsDriver PivotalDriver::type = HdfsDriver::LIBHDFS3; - -typedef ::testing::Types DriverTypes; -TYPED_TEST_CASE(TestHadoopFileSystem, DriverTypes); - -TYPED_TEST(TestHadoopFileSystem, ConnectsAgain) { +TEST_F(TestHadoopFileSystem, ConnectsAgain) { SKIP_IF_NO_DRIVER(); std::shared_ptr client; @@ -188,7 +150,7 @@ TYPED_TEST(TestHadoopFileSystem, ConnectsAgain) { ASSERT_OK(client->Disconnect()); } -TYPED_TEST(TestHadoopFileSystem, MultipleClients) { +TEST_F(TestHadoopFileSystem, MultipleClients) { SKIP_IF_NO_DRIVER(); ASSERT_OK(this->MakeScratchDir()); @@ -205,7 +167,7 @@ TYPED_TEST(TestHadoopFileSystem, MultipleClients) { ASSERT_OK(client2->Disconnect()); } -TYPED_TEST(TestHadoopFileSystem, MakeDirectory) { +TEST_F(TestHadoopFileSystem, MakeDirectory) { SKIP_IF_NO_DRIVER(); std::string path = this->ScratchPath("create-directory"); @@ -224,7 +186,7 @@ TYPED_TEST(TestHadoopFileSystem, MakeDirectory) { ASSERT_RAISES(IOError, this->client_->ListDirectory(path, &listing)); } -TYPED_TEST(TestHadoopFileSystem, GetCapacityUsed) { +TEST_F(TestHadoopFileSystem, GetCapacityUsed) { SKIP_IF_NO_DRIVER(); // Who knows what is actually in your DFS cluster, but expect it to have @@ -237,7 +199,7 @@ TYPED_TEST(TestHadoopFileSystem, GetCapacityUsed) { ASSERT_LT(0, nbytes); } -TYPED_TEST(TestHadoopFileSystem, GetPathInfo) { +TEST_F(TestHadoopFileSystem, GetPathInfo) { SKIP_IF_NO_DRIVER(); HdfsPathInfo info; @@ -267,7 +229,7 @@ TYPED_TEST(TestHadoopFileSystem, GetPathInfo) { ASSERT_EQ(size, info.size); } -TYPED_TEST(TestHadoopFileSystem, GetPathInfoNotExist) { +TEST_F(TestHadoopFileSystem, GetPathInfoNotExist) { // ARROW-2919: Test that the error message is reasonable SKIP_IF_NO_DRIVER(); @@ -284,7 +246,7 @@ TYPED_TEST(TestHadoopFileSystem, GetPathInfoNotExist) { ASSERT_LT(error_message.find(path), std::string::npos); } -TYPED_TEST(TestHadoopFileSystem, AppendToFile) { +TEST_F(TestHadoopFileSystem, AppendToFile) { SKIP_IF_NO_DRIVER(); ASSERT_OK(this->MakeScratchDir()); @@ -303,7 +265,7 @@ TYPED_TEST(TestHadoopFileSystem, AppendToFile) { ASSERT_EQ(size * 2, info.size); } -TYPED_TEST(TestHadoopFileSystem, ListDirectory) { +TEST_F(TestHadoopFileSystem, ListDirectory) { SKIP_IF_NO_DRIVER(); const int size = 100; @@ -343,7 +305,7 @@ TYPED_TEST(TestHadoopFileSystem, ListDirectory) { } } -TYPED_TEST(TestHadoopFileSystem, ReadableMethods) { +TEST_F(TestHadoopFileSystem, ReadableMethods) { SKIP_IF_NO_DRIVER(); ASSERT_OK(this->MakeScratchDir()); @@ -380,7 +342,7 @@ TYPED_TEST(TestHadoopFileSystem, ReadableMethods) { ASSERT_OK_AND_EQ(60, file->Tell()); } -TYPED_TEST(TestHadoopFileSystem, LargeFile) { +TEST_F(TestHadoopFileSystem, LargeFile) { SKIP_IF_NO_DRIVER(); ASSERT_OK(this->MakeScratchDir()); @@ -413,7 +375,7 @@ TYPED_TEST(TestHadoopFileSystem, LargeFile) { ASSERT_EQ(0, std::memcmp(buffer2->data(), data.data(), size)); } -TYPED_TEST(TestHadoopFileSystem, RenameFile) { +TEST_F(TestHadoopFileSystem, RenameFile) { SKIP_IF_NO_DRIVER(); ASSERT_OK(this->MakeScratchDir()); @@ -430,7 
+392,7 @@ TYPED_TEST(TestHadoopFileSystem, RenameFile) { ASSERT_TRUE(this->client_->Exists(dst_path)); } -TYPED_TEST(TestHadoopFileSystem, ChmodChown) { +TEST_F(TestHadoopFileSystem, ChmodChown) { SKIP_IF_NO_DRIVER(); ASSERT_OK(this->MakeScratchDir()); @@ -455,7 +417,7 @@ TYPED_TEST(TestHadoopFileSystem, ChmodChown) { ASSERT_EQ("hadoop", info.group); } -TYPED_TEST(TestHadoopFileSystem, ThreadSafety) { +TEST_F(TestHadoopFileSystem, ThreadSafety) { SKIP_IF_NO_DRIVER(); ASSERT_OK(this->MakeScratchDir()); diff --git a/docker-compose.yml b/docker-compose.yml index f70f22bb92517..4b0c3391d8345 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1098,7 +1098,6 @@ services: # be set to ${MAVEN} maven: 3.5 hdfs: ${HDFS} - libhdfs3: ${LIBHDFS3} links: - impala:impala environment: diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index d72c98c452400..d71995c1dae0b 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -120,7 +120,7 @@ def parse_git(root, **kwargs): FixedSizeBufferWriter, BufferReader, BufferOutputStream, OSFile, MemoryMappedFile, memory_map, - create_memory_map, have_libhdfs, have_libhdfs3, + create_memory_map, have_libhdfs, MockOutputStream, input_stream, output_stream) from pyarrow.lib import (ChunkedArray, RecordBatch, Table, diff --git a/python/pyarrow/_hdfs.pyx b/python/pyarrow/_hdfs.pyx index c47394b03db54..0cdb24e835eb2 100644 --- a/python/pyarrow/_hdfs.pyx +++ b/python/pyarrow/_hdfs.pyx @@ -32,10 +32,6 @@ cdef class HdfsOptions: ---------- endpoint : tuple of (host, port) pair For example ('localhost', 8020). - driver : {'libhdfs', 'libhdfs3'}, default 'libhdfs' - Connect using libhdfs (JNI-based) or libhdfs3 (3rd-party C++ library - from Apache HAWQ (incubating)). Prefer libhdfs because libhdfs3 project - is not maintained anymore. replication : int, default 3 Number of copies each block will have. 
buffer_size : int, default 0 @@ -51,12 +47,10 @@ cdef class HdfsOptions: # Avoid mistakingly creating attributes __slots__ = () - def __init__(self, endpoint=None, driver=None, replication=None, + def __init__(self, endpoint=None, replication=None, user=None, buffer_size=None, default_block_size=None): if endpoint is not None: self.endpoint = endpoint - if driver is not None: - self.driver = driver if replication is not None: self.replication = replication if user is not None: @@ -95,22 +89,6 @@ cdef class HdfsOptions: self.options.connection_config.host = tobytes(value[0]) self.options.connection_config.port = int(value[1]) - @property - def driver(self): - if self.options.connection_config.driver == HdfsDriver_LIBHDFS3: - return 'libhdfs3' - else: - return 'libhdfs' - - @driver.setter - def driver(self, value): - if value == 'libhdfs3': - self.options.connection_config.driver = HdfsDriver_LIBHDFS3 - elif value == 'libhdfs': - self.options.connection_config.driver = HdfsDriver_LIBHDFS - else: - raise ValueError('Choose either libhdfs of libhdfs3') - @property def user(self): return frombytes(self.options.connection_config.user) diff --git a/python/pyarrow/hdfs.py b/python/pyarrow/hdfs.py index 704f1cc67a2ce..f7ee8ec5d967e 100644 --- a/python/pyarrow/hdfs.py +++ b/python/pyarrow/hdfs.py @@ -36,12 +36,11 @@ def __init__(self, host="default", port=0, user=None, kerb_ticket=None, if driver == 'libhdfs': _maybe_set_hadoop_classpath() - self._connect(host, port, user, kerb_ticket, driver, extra_conf) + self._connect(host, port, user, kerb_ticket, extra_conf) def __reduce__(self): return (HadoopFileSystem, (self.host, self.port, self.user, - self.kerb_ticket, self.driver, - self.extra_conf)) + self.kerb_ticket, self.extra_conf)) def _isfilestore(self): """ @@ -177,7 +176,7 @@ def _libhdfs_walk_files_dirs(top_path, contents): def connect(host="default", port=0, user=None, kerb_ticket=None, - driver='libhdfs', extra_conf=None): + extra_conf=None): """ Connect to an HDFS cluster. All parameters are optional and should only be set if the defaults need to be overridden. @@ -192,9 +191,6 @@ def connect(host="default", port=0, user=None, kerb_ticket=None, port : NameNode's port. Set to 0 for default or logical (HA) nodes. user : Username when connecting to HDFS; None implies login user. kerb_ticket : Path to Kerberos ticket cache. 
- driver : {'libhdfs', 'libhdfs3'}, default 'libhdfs' - Connect using libhdfs (JNI-based) or libhdfs3 (3rd-party C++ - library from Apache HAWQ (incubating) ) extra_conf : dict, default None extra Key/Value pairs for config; Will override any hdfs-site.xml properties @@ -209,6 +205,6 @@ def connect(host="default", port=0, user=None, kerb_ticket=None, filesystem : HadoopFileSystem """ fs = HadoopFileSystem(host=host, port=port, user=user, - kerb_ticket=kerb_ticket, driver=driver, + kerb_ticket=kerb_ticket, extra_conf=extra_conf) return fs diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi index b1c97b3ab2e5d..adaf7c2756063 100644 --- a/python/pyarrow/io-hdfs.pxi +++ b/python/pyarrow/io-hdfs.pxi @@ -33,15 +33,6 @@ def have_libhdfs(): return False -def have_libhdfs3(): - try: - with nogil: - check_status(HaveLibHdfs3()) - return True - except Exception: - return False - - def strip_hdfs_abspath(path): m = _HDFS_PATH_RE.match(path) if m: @@ -59,11 +50,10 @@ cdef class HadoopFileSystem: object host object user object kerb_ticket - object driver int port dict extra_conf - def _connect(self, host, port, user, kerb_ticket, driver, extra_conf): + def _connect(self, host, port, user, kerb_ticket, extra_conf): cdef HdfsConnectionConfig conf if host is not None: @@ -81,17 +71,8 @@ cdef class HadoopFileSystem: conf.kerb_ticket = tobytes(kerb_ticket) self.kerb_ticket = kerb_ticket - if driver == 'libhdfs': - with nogil: - check_status(HaveLibHdfs()) - conf.driver = HdfsDriver_LIBHDFS - elif driver == 'libhdfs3': - with nogil: - check_status(HaveLibHdfs3()) - conf.driver = HdfsDriver_LIBHDFS3 - else: - raise ValueError("unknown driver: %r" % driver) - self.driver = driver + with nogil: + check_status(HaveLibHdfs()) if extra_conf is not None and isinstance(extra_conf, dict): conf.extra_conf = {tobytes(k): tobytes(v) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 3a3b6c91f2661..bd9f3cf143c77 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -557,12 +557,6 @@ def test_hdfs_options(hdfs_server): with pytest.raises(TypeError): options.endpoint = 'localhost:8000' - assert options.driver == 'libhdfs' - options.driver = 'libhdfs3' - assert options.driver == 'libhdfs3' - with pytest.raises(ValueError): - options.driver = 'unknown' - assert options.replication == 3 options.replication = 2 assert options.replication == 2 diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py index ad1dea8200200..9a484aaee5d00 100644 --- a/python/pyarrow/tests/test_hdfs.py +++ b/python/pyarrow/tests/test_hdfs.py @@ -37,7 +37,7 @@ # HDFS tests -def hdfs_test_client(driver='libhdfs'): +def hdfs_test_client(): host = os.environ.get('ARROW_HDFS_TEST_HOST', 'default') user = os.environ.get('ARROW_HDFS_TEST_USER', None) try: @@ -46,7 +46,7 @@ def hdfs_test_client(driver='libhdfs'): raise ValueError('Env variable ARROW_HDFS_TEST_PORT was not ' 'an integer') - return pa.hdfs.connect(host, port, user, driver=driver) + return pa.hdfs.connect(host, port, user) @pytest.mark.hdfs @@ -66,7 +66,7 @@ def _make_test_file(self, hdfs, test_name, test_path, test_data): @classmethod def setUpClass(cls): cls.check_driver() - cls.hdfs = hdfs_test_client(cls.DRIVER) + cls.hdfs = hdfs_test_client() cls.tmp_path = '/tmp/pyarrow-test-{}'.format(random.randint(0, 1000)) cls.hdfs.mkdir(cls.tmp_path) @@ -75,10 +75,6 @@ def tearDownClass(cls): cls.hdfs.delete(cls.tmp_path, recursive=True) cls.hdfs.close() - def test_unknown_driver(self): - with 
pytest.raises(ValueError): - hdfs_test_client(driver="not_a_driver_name") - def test_pickle(self): s = pickle.dumps(self.hdfs) h2 = pickle.loads(s) @@ -87,7 +83,6 @@ def test_pickle(self): assert h2.port == self.hdfs.port assert h2.user == self.hdfs.user assert h2.kerb_ticket == self.hdfs.kerb_ticket - assert h2.driver == self.hdfs.driver # smoketest unpickled client works h2.ls(self.tmp_path) @@ -379,8 +374,6 @@ def test_write_to_dataset_no_partitions(self): class TestLibHdfs(HdfsTestCases, unittest.TestCase): - DRIVER = 'libhdfs' - @classmethod def check_driver(cls): if not pa.have_libhdfs(): @@ -400,20 +393,6 @@ def test_orphaned_file(self): f = None # noqa -class TestLibHdfs3(HdfsTestCases, unittest.TestCase): - - DRIVER = 'libhdfs3' - - @classmethod - def check_driver(cls): - if not pa.have_libhdfs3(): - message = 'No libhdfs3 available on system' - if os.environ.get('PYARROW_HDFS_TEST_LIBHDFS3_REQUIRE'): - pytest.fail(message) - else: - pytest.skip(message) - - def _get_hdfs_uri(path): host = os.environ.get('ARROW_HDFS_TEST_HOST', 'localhost') try: @@ -430,8 +409,7 @@ def _get_hdfs_uri(path): @pytest.mark.pandas @pytest.mark.parquet @pytest.mark.fastparquet -@pytest.mark.parametrize('client', ['libhdfs', 'libhdfs3']) -def test_fastparquet_read_with_hdfs(client): +def test_fastparquet_read_with_hdfs(): from pandas.testing import assert_frame_equal try: @@ -442,7 +420,7 @@ def test_fastparquet_read_with_hdfs(client): import pyarrow.parquet as pq fastparquet = pytest.importorskip('fastparquet') - fs = hdfs_test_client(client) + fs = hdfs_test_client() df = util.make_dataframe() From 85b45ada81e92d25b5281fa55d6143e94cbd0b44 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 19 Feb 2020 09:05:21 +0900 Subject: [PATCH 3/3] Remove libhdfs3 related code --- cpp/src/arrow/filesystem/hdfs_test.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs_test.cc b/cpp/src/arrow/filesystem/hdfs_test.cc index 701f62377e387..e5ab93a7f3698 100644 --- a/cpp/src/arrow/filesystem/hdfs_test.cc +++ b/cpp/src/arrow/filesystem/hdfs_test.cc @@ -91,9 +91,6 @@ class TestHadoopFileSystem : public ::testing::Test { ss << "hdfs://" << options_.connection_config.host << ":" << options_.connection_config.port << "/" << "?replication=0&user=" << options_.connection_config.user; - if (use_hdfs3_) { - ss << "&use_hdfs3=1"; - } std::shared_ptr uri_fs; std::string path; @@ -174,7 +171,6 @@ class TestHadoopFileSystem : public ::testing::Test { protected: std::shared_ptr fs_; - bool use_hdfs3_; HdfsOptions options_; bool loaded_driver_ = false; };
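
With the series applied, the Python HDFS test suites can be forced to fail
rather than skip when the JNI driver cannot be loaded. A sketch of driving
the same run from Python (the env variable names and pytest invocation are
taken from ci/scripts/integration_hdfs.sh, which does the equivalent with
shell exports):

    import os
    import subprocess
    import sys

    # Equivalent of the exports in ci/scripts/integration_hdfs.sh:
    # a missing libhdfs now fails the run instead of silently skipping it.
    env = dict(os.environ,
               PYARROW_TEST_HDFS='ON',
               PYARROW_HDFS_TEST_LIBHDFS_REQUIRE='ON')
    subprocess.run(
        [sys.executable, '-m', 'pytest', '-v',
         '--pyargs', 'pyarrow.tests.test_hdfs'],
        env=env, check=True)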