Skip to content

Commit

Permalink
TEST-modin-project#6830: Use local s3 server instead of public s3 buckets
Browse files Browse the repository at this point in the history

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev committed Jan 17, 2024
1 parent 4bdaa49 commit 298eef5
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 7 deletions.
7 changes: 5 additions & 2 deletions modin/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,8 +534,8 @@ def s3_storage_options(worker_id):
# to do that is to use the `worker_id`, which is unique, to determine what port to point
# to. We arbitrarily assign `5` as a worker id to the master worker, since we need a number
# for each worker, and we never run tests with more than `pytest -n 4`.
worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw")
url = f"http://127.0.0.1:555{worker_id}/"
worker_id = "0" if worker_id == "master" else worker_id.lstrip("gw")
url = f"http://127.0.0.1:550{worker_id}/"
return {"client_kwargs": {"endpoint_url": url}}


Expand Down Expand Up @@ -676,6 +676,9 @@ def s3_resource(s3_base):
s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base})

s3.rm(bucket, recursive=True)
# bucket = conn.Bucket(bucket)
# bucket.objects.delete()
# bucket.delete()
for _ in range(20):
# We want to wait until the deletion finishes.
if not cli.list_buckets()["Buckets"]:
Expand Down
Binary file not shown.
Binary file not shown.
17 changes: 12 additions & 5 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2031,15 +2031,22 @@ def test_read_parquet_5767(self, tmp_path, engine):
# both Modin and pandas read column "b" as a category
df_equals(test_df, read_df.astype("int64"))

def test_read_parquet_s3_with_column_partitioning(self, engine):
# This test case comes from
def test_read_parquet_s3_with_column_partitioning(
self, s3_resource, engine, s3_storage_options
):
# https://github.com/modin-project/modin/issues/4636
dataset_url = "s3://modin-datasets/modin-bugs/modin_bug_5159_parquet/df.parquet"
dataset_path = "modin/pandas/test/data/issue5159.parquet"
s3_path = "s3://modin-test/modin-bugs/issue5159.parquet"

# TODO: write files to local s3 storage not through pandas
pandas.read_parquet(dataset_path).to_parquet(
s3_path, engine=engine, storage_options=s3_storage_options
)
eval_io(
fn_name="read_parquet",
path=dataset_url,
path=s3_path,
engine=engine,
storage_options={"anon": True},
storage_options=s3_storage_options,
)


Expand Down

0 comments on commit 298eef5

Please sign in to comment.