Skip to content

Commit

Permalink
Merge branch 'main' into fix/missindented-categories
Browse files Browse the repository at this point in the history
  • Loading branch information
jaidisido authored Mar 5, 2024
2 parents 8dbd2d0 + 5b20362 commit fc0c8bd
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 24 deletions.
64 changes: 40 additions & 24 deletions tests/unit/test_athena.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,8 +576,19 @@ def test_athena_time_zone(glue_database):
assert df["value"][0].year == datetime.datetime.utcnow().year


@pytest.mark.xfail(raises=NotImplementedError, reason="Unable to create pandas categorical from pyarrow table")
def test_category(path, glue_table, glue_database):
@pytest.mark.parametrize(
"ctas_approach",
[
pytest.param(False),
pytest.param(
True,
marks=pytest.mark.xfail(
raises=NotImplementedError, reason="Unable to create pandas categorical from pyarrow table"
),
),
],
)
def test_category(path: str, glue_table: str, glue_database: str, ctas_approach: bool) -> None:
df = get_df_category()
wr.s3.to_parquet(
df=df,
Expand All @@ -588,37 +599,42 @@ def test_category(path, glue_table, glue_database):
mode="overwrite",
partition_cols=["par0", "par1"],
)
df2 = wr.s3.read_parquet(
path=path,
dataset=True,
pyarrow_additional_kwargs={
"categories": [c for c in df.columns if c not in ["par0", "par1"]],
"strings_to_categorical": True,
},
)
ensure_data_types_category(df2)
df2 = wr.athena.read_sql_query(f"SELECT * FROM {glue_table}", database=glue_database, categories=list(df.columns))
ensure_data_types_category(df2)
df2 = wr.athena.read_sql_table(table=glue_table, database=glue_database, categories=list(df.columns))
ensure_data_types_category(df2)

df2 = wr.athena.read_sql_query(
f"SELECT * FROM {glue_table}", database=glue_database, categories=list(df.columns), ctas_approach=False
f"SELECT * FROM {glue_table}", database=glue_database, categories=list(df.columns), ctas_approach=ctas_approach
)
ensure_data_types_category(df2)
dfs = wr.athena.read_sql_query(
f"SELECT * FROM {glue_table}",


@pytest.mark.parametrize(
"ctas_approach",
[
pytest.param(False),
pytest.param(
True,
marks=pytest.mark.xfail(
raises=NotImplementedError, reason="Unable to create pandas categorical from pyarrow table"
),
),
],
)
def test_category_chunked(path: str, glue_table: str, glue_database: str, ctas_approach: bool) -> None:
df = get_df_category()
wr.s3.to_parquet(
df=df,
path=path,
dataset=True,
database=glue_database,
categories=list(df.columns),
ctas_approach=False,
chunksize=1,
table=glue_table,
mode="overwrite",
partition_cols=["par0", "par1"],
)
for df2 in dfs:
ensure_data_types_category(df2)

dfs = wr.athena.read_sql_query(
f"SELECT * FROM {glue_table}",
database=glue_database,
categories=list(df.columns),
ctas_approach=True,
ctas_approach=ctas_approach,
chunksize=1,
)
for df2 in dfs:
Expand Down
25 changes: 25 additions & 0 deletions tests/unit/test_s3_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from .._utils import (
assert_pandas_equals,
ensure_data_types,
ensure_data_types_category,
get_df_category,
get_df_list,
is_ray_modin,
to_pandas,
Expand Down Expand Up @@ -888,6 +890,29 @@ def test_chunked_columns(path, columns, chunked):
assert df[columns].shape if columns else df.shape == df2.shape


@pytest.mark.xfail(raises=NotImplementedError, reason="Unable to create pandas categorical from pyarrow table")
def test_category_s3_read_parquet(path: str, glue_table: str, glue_database: str) -> None:
    """Round-trip a categorical DataFrame through a partitioned S3 parquet dataset.

    Writes the categorical fixture frame as a Glue-registered, partitioned
    dataset, then reads it back with ``wr.s3.read_parquet`` asking pyarrow to
    materialize the non-partition columns as pandas categoricals. Marked
    ``xfail``: pyarrow currently cannot build the pandas categoricals here.
    """
    partition_columns = ["par0", "par1"]
    df = get_df_category()

    # Write the fixture as a partitioned dataset registered in the Glue catalog.
    wr.s3.to_parquet(
        df=df,
        path=path,
        dataset=True,
        database=glue_database,
        table=glue_table,
        mode="overwrite",
        partition_cols=partition_columns,
    )

    # Every column except the partition keys should come back categorical.
    category_columns = [column for column in df.columns if column not in partition_columns]
    result = wr.s3.read_parquet(
        path=path,
        dataset=True,
        pyarrow_additional_kwargs={
            "categories": category_columns,
            "strings_to_categorical": True,
        },
    )
    ensure_data_types_category(result)


@pytest.mark.xfail(
is_ray_modin,
raises=TypeError,
Expand Down

0 comments on commit fc0c8bd

Please sign in to comment.