From 62c8d3bf0ce68edbba22d564534f77d73995b91c Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Mon, 20 Dec 2021 17:31:49 +0100
Subject: [PATCH] ARROW-15165: [Python] Expose function to resolve S3 bucket
 region

---
 cpp/src/arrow/filesystem/s3fs.cc              |  9 +++++--
 cpp/src/arrow/filesystem/s3fs.h               |  2 +-
 .../arrow/filesystem/s3fs_narrative_test.cc   |  2 +-
 cpp/src/arrow/filesystem/s3fs_test.cc         | 19 +++++++++-----
 python/pyarrow/_fs.pyx                        | 10 +++++---
 python/pyarrow/_s3fs.pyx                      | 25 +++++++++++++++++++
 python/pyarrow/fs.py                          |  3 ++-
 python/pyarrow/includes/libarrow_fs.pxd       |  2 ++
 python/pyarrow/tests/test_fs.py               | 12 +++++++++
 9 files changed, 69 insertions(+), 15 deletions(-)

diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index 1199f39997f07..5a4004d8ca481 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -359,7 +359,7 @@ Result<S3Options> S3Options::FromUri(const Uri& uri, std::string* out_path) {
 
   if (!region_set && !bucket.empty() && options.endpoint_override.empty()) {
     // XXX Should we use a dedicated resolver with the given credentials?
-    ARROW_ASSIGN_OR_RAISE(options.region, ResolveBucketRegion(bucket));
+    ARROW_ASSIGN_OR_RAISE(options.region, ResolveS3BucketRegion(bucket));
   }
 
   return options;
@@ -2474,7 +2474,12 @@ Result<std::shared_ptr<io::OutputStream>> S3FileSystem::OpenAppendStream(
 // Top-level utility functions
 //
 
-Result<std::string> ResolveBucketRegion(const std::string& bucket) {
+Result<std::string> ResolveS3BucketRegion(const std::string& bucket) {
+  if (bucket.empty() || bucket.find_first_of(kSep) != bucket.npos ||
+      internal::IsLikelyUri(bucket)) {
+    return Status::Invalid("Not a valid bucket name: '", bucket, "'");
+  }
+
   ARROW_ASSIGN_OR_RAISE(auto resolver, RegionResolver::DefaultInstance());
   return resolver->ResolveRegion(bucket);
 }
diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h
index 502353ef14348..e1b8a0933ceeb 100644
--- a/cpp/src/arrow/filesystem/s3fs.h
+++ b/cpp/src/arrow/filesystem/s3fs.h
@@ -311,7 +311,7 @@ ARROW_EXPORT
 Status FinalizeS3();
 
 ARROW_EXPORT
-Result<std::string> ResolveBucketRegion(const std::string& bucket);
+Result<std::string> ResolveS3BucketRegion(const std::string& bucket);
 
 }  // namespace fs
 }  // namespace arrow
diff --git a/cpp/src/arrow/filesystem/s3fs_narrative_test.cc b/cpp/src/arrow/filesystem/s3fs_narrative_test.cc
index 0a4b35ff83204..f75ca4bdfd04d 100644
--- a/cpp/src/arrow/filesystem/s3fs_narrative_test.cc
+++ b/cpp/src/arrow/filesystem/s3fs_narrative_test.cc
@@ -222,7 +222,7 @@ void TestMain(int argc, char** argv) {
   ASSERT_OK(InitializeS3(options));
 
   if (FLAGS_region.empty()) {
-    ASSERT_OK_AND_ASSIGN(FLAGS_region, ResolveBucketRegion(FLAGS_bucket));
+    ASSERT_OK_AND_ASSIGN(FLAGS_region, ResolveS3BucketRegion(FLAGS_bucket));
   }
 
   if (FLAGS_clear) {
diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc
index fba431b8fd00b..db1249f12fa28 100644
--- a/cpp/src/arrow/filesystem/s3fs_test.cc
+++ b/cpp/src/arrow/filesystem/s3fs_test.cc
@@ -303,28 +303,35 @@ TEST_F(S3OptionsTest, FromAssumeRole) {
 class S3RegionResolutionTest : public AwsTestMixin {};
 
 TEST_F(S3RegionResolutionTest, PublicBucket) {
-  ASSERT_OK_AND_EQ("us-east-2", ResolveBucketRegion("ursa-labs-taxi-data"));
+  ASSERT_OK_AND_EQ("us-east-2", ResolveS3BucketRegion("ursa-labs-taxi-data"));
 
   // Taken from a registry of open S3-hosted datasets
   // at https://github.com/awslabs/open-data-registry
-  ASSERT_OK_AND_EQ("eu-west-2", ResolveBucketRegion("aws-earth-mo-atmospheric-ukv-prd"));
+  ASSERT_OK_AND_EQ("eu-west-2",
+                   ResolveS3BucketRegion("aws-earth-mo-atmospheric-ukv-prd"));
   // Same again, cached
-  ASSERT_OK_AND_EQ("eu-west-2", ResolveBucketRegion("aws-earth-mo-atmospheric-ukv-prd"));
+  ASSERT_OK_AND_EQ("eu-west-2",
+                   ResolveS3BucketRegion("aws-earth-mo-atmospheric-ukv-prd"));
 }
 
 TEST_F(S3RegionResolutionTest, RestrictedBucket) {
-  ASSERT_OK_AND_EQ("us-west-2", ResolveBucketRegion("ursa-labs-r-test"));
+  ASSERT_OK_AND_EQ("us-west-2", ResolveS3BucketRegion("ursa-labs-r-test"));
   // Same again, cached
-  ASSERT_OK_AND_EQ("us-west-2", ResolveBucketRegion("ursa-labs-r-test"));
+  ASSERT_OK_AND_EQ("us-west-2", ResolveS3BucketRegion("ursa-labs-r-test"));
 }
 
 TEST_F(S3RegionResolutionTest, NonExistentBucket) {
-  auto maybe_region = ResolveBucketRegion("ursa-labs-non-existent-bucket");
+  auto maybe_region = ResolveS3BucketRegion("ursa-labs-non-existent-bucket");
   ASSERT_RAISES(IOError, maybe_region);
   ASSERT_THAT(maybe_region.status().message(),
               ::testing::HasSubstr("Bucket 'ursa-labs-non-existent-bucket' not found"));
 }
 
+TEST_F(S3RegionResolutionTest, InvalidBucketName) {
+  ASSERT_RAISES(Invalid, ResolveS3BucketRegion("s3:bucket"));
+  ASSERT_RAISES(Invalid, ResolveS3BucketRegion("foo/bar"));
+}
+
 ////////////////////////////////////////////////////////////////////////////
 // S3FileSystem region test
 
diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx
index 24e02f5ded193..bb59d1852c470 100644
--- a/python/pyarrow/_fs.pyx
+++ b/python/pyarrow/_fs.pyx
@@ -336,15 +336,17 @@ cdef class FileSystem(_Weakrefable):
         the FileSystem instance.
         """
         cdef:
-            c_string path
+            c_string c_path
+            c_string c_uri
             CResult[shared_ptr[CFileSystem]] result
 
         if isinstance(uri, pathlib.Path):
             # Make absolute
             uri = uri.resolve().absolute()
-        uri = _stringify_path(uri)
-        result = CFileSystemFromUriOrPath(tobytes(uri), &path)
-        return FileSystem.wrap(GetResultValue(result)), frombytes(path)
+        c_uri = tobytes(_stringify_path(uri))
+        with nogil:
+            result = CFileSystemFromUriOrPath(c_uri, &c_path)
+        return FileSystem.wrap(GetResultValue(result)), frombytes(c_path)
 
     cdef init(self, const shared_ptr[CFileSystem]& wrapped):
         self.wrapped = wrapped
diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx
index 5829d74d31fa0..403b8105e7cab 100644
--- a/python/pyarrow/_s3fs.pyx
+++ b/python/pyarrow/_s3fs.pyx
@@ -54,6 +54,31 @@ def finalize_s3():
     check_status(CFinalizeS3())
 
 
+def resolve_s3_region(bucket):
+    """
+    Resolve the S3 region of a bucket.
+
+    Parameters
+    ----------
+    bucket : str
+        An S3 bucket name
+
+    Returns
+    -------
+    region : str
+        An S3 region name
+    """
+    cdef:
+        c_string c_bucket
+        c_string c_region
+
+    c_bucket = tobytes(bucket)
+    with nogil:
+        c_region = GetResultValue(ResolveS3BucketRegion(c_bucket))
+
+    return frombytes(c_region)
+
+
 cdef class S3FileSystem(FileSystem):
     """
     S3-backed FileSystem implementation
diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py
index b3b27f24eac25..ae611557ac039 100644
--- a/python/pyarrow/fs.py
+++ b/python/pyarrow/fs.py
@@ -47,7 +47,8 @@
 
 try:
     from pyarrow._s3fs import (  # noqa
-        S3FileSystem, S3LogLevel, initialize_s3, finalize_s3)
+        S3FileSystem, S3LogLevel, initialize_s3, finalize_s3,
+        resolve_s3_region)
 except ImportError:
     _not_imported.append("S3FileSystem")
 else:
diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd
index 9dca5fbf62d9f..1198784a16789 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -197,6 +197,8 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:
             const CS3GlobalOptions& options)
     cdef CStatus CFinalizeS3 "arrow::fs::FinalizeS3"()
 
+    cdef CResult[c_string] ResolveS3BucketRegion(const c_string& bucket)
+
     cdef cppclass CHdfsOptions "arrow::fs::HdfsOptions":
         HdfsConnectionConfig connection_config
         int32_t buffer_size
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index e51310b559adf..90aeffed4a522 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -1630,6 +1630,18 @@ def test_s3_real_aws_region_selection():
     assert fs.region == 'us-east-3'
 
 
+@pytest.mark.s3
+def test_resolve_s3_region():
+    from pyarrow.fs import resolve_s3_region
+    assert resolve_s3_region('ursa-labs-taxi-data') == 'us-east-2'
+    assert resolve_s3_region('mf-nwp-models') == 'eu-west-1'
+
+    with pytest.raises(ValueError, match="Not a valid bucket name"):
+        resolve_s3_region('foo/bar')
+    with pytest.raises(ValueError, match="Not a valid bucket name"):
+        resolve_s3_region('s3:bucket')
+
+
 @pytest.mark.s3
 def test_copy_files(s3_connection, s3fs, tempdir):
     fs = s3fs["fs"]
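
Reviewer note (not part of the diff): below is a minimal usage sketch of the
new pyarrow.fs.resolve_s3_region() helper exposed by this patch. It assumes a
pyarrow build with S3 support and network access to AWS; the anonymous=True
flag and the bucket name are illustrative, reusing the public dataset already
exercised by the tests above.

    # Resolve the region of a public bucket, then create a filesystem pinned
    # to that region instead of relying on an implicit lookup.
    from pyarrow import fs

    region = fs.resolve_s3_region('ursa-labs-taxi-data')
    print(region)  # -> 'us-east-2'

    s3 = fs.S3FileSystem(region=region, anonymous=True)

    # Bucket names that look like paths or URIs are rejected up front
    # (pyarrow raises ArrowInvalid, a subclass of ValueError).
    try:
        fs.resolve_s3_region('foo/bar')
    except ValueError as exc:
        print(exc)  # Not a valid bucket name: 'foo/bar'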