Skip to content

Commit

Permalink
Merge branch 'main' into fix/missindented-categories
Browse files Browse the repository at this point in the history
  • Loading branch information
jaidisido authored Mar 5, 2024
2 parents 8dbd2d0 + 5b20362 commit fc0c8bd
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 24 deletions.
64 changes: 40 additions & 24 deletions tests/unit/test_athena.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,8 +576,19 @@ def test_athena_time_zone(glue_database):
assert df["value"][0].year == datetime.datetime.utcnow().year


@pytest.mark.xfail(raises=NotImplementedError, reason="Unable to create pandas categorical from pyarrow table")
def test_category(path, glue_table, glue_database):
@pytest.mark.parametrize(
"ctas_approach",
[
pytest.param(False),
pytest.param(
True,
marks=pytest.mark.xfail(
raises=NotImplementedError, reason="Unable to create pandas categorical from pyarrow table"
),
),
],
)
def test_category(path: str, glue_table: str, glue_database: str, ctas_approach: bool) -> None:
df = get_df_category()
wr.s3.to_parquet(
df=df,
Expand All @@ -588,37 +599,42 @@ def test_category(path, glue_table, glue_database):
mode="overwrite",
partition_cols=["par0", "par1"],
)
df2 = wr.s3.read_parquet(
path=path,
dataset=True,
pyarrow_additional_kwargs={
"categories": [c for c in df.columns if c not in ["par0", "par1"]],
"strings_to_categorical": True,
},
)
ensure_data_types_category(df2)
df2 = wr.athena.read_sql_query(f"SELECT * FROM {glue_table}", database=glue_database, categories=list(df.columns))
ensure_data_types_category(df2)
df2 = wr.athena.read_sql_table(table=glue_table, database=glue_database, categories=list(df.columns))
ensure_data_types_category(df2)

df2 = wr.athena.read_sql_query(
f"SELECT * FROM {glue_table}", database=glue_database, categories=list(df.columns), ctas_approach=False
f"SELECT * FROM {glue_table}", database=glue_database, categories=list(df.columns), ctas_approach=ctas_approach
)
ensure_data_types_category(df2)
dfs = wr.athena.read_sql_query(
f"SELECT * FROM {glue_table}",


@pytest.mark.parametrize(
"ctas_approach",
[
pytest.param(False),
pytest.param(
True,
marks=pytest.mark.xfail(
raises=NotImplementedError, reason="Unable to create pandas categorical from pyarrow table"
),
),
],
)
def test_category_chunked(path: str, glue_table: str, glue_database: str, ctas_approach: bool) -> None:
df = get_df_category()
wr.s3.to_parquet(
df=df,
path=path,
dataset=True,
database=glue_database,
categories=list(df.columns),
ctas_approach=False,
chunksize=1,
table=glue_table,
mode="overwrite",
partition_cols=["par0", "par1"],
)
for df2 in dfs:
ensure_data_types_category(df2)

dfs = wr.athena.read_sql_query(
f"SELECT * FROM {glue_table}",
database=glue_database,
categories=list(df.columns),
ctas_approach=True,
ctas_approach=ctas_approach,
chunksize=1,
)
for df2 in dfs:
Expand Down
25 changes: 25 additions & 0 deletions tests/unit/test_s3_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from .._utils import (
assert_pandas_equals,
ensure_data_types,
ensure_data_types_category,
get_df_category,
get_df_list,
is_ray_modin,
to_pandas,
Expand Down Expand Up @@ -888,6 +890,29 @@ def test_chunked_columns(path, columns, chunked):
assert df[columns].shape if columns else df.shape == df2.shape


@pytest.mark.xfail(raises=NotImplementedError, reason="Unable to create pandas categorical from pyarrow table")
def test_category_s3_read_parquet(path: str, glue_table: str, glue_database: str) -> None:
    """Round-trip a categorical DataFrame through a partitioned S3 parquet dataset.

    Writes the categorical fixture frame as a Glue-registered, partitioned
    dataset, then reads it back with ``wr.s3.read_parquet`` asking pyarrow to
    materialize the non-partition columns as pandas categoricals. Marked
    ``xfail``: pyarrow currently cannot build the pandas categoricals here.
    """
    partition_columns = ["par0", "par1"]
    df = get_df_category()

    # Write the fixture as a partitioned dataset registered in the Glue catalog.
    wr.s3.to_parquet(
        df=df,
        path=path,
        dataset=True,
        database=glue_database,
        table=glue_table,
        mode="overwrite",
        partition_cols=partition_columns,
    )

    # Every column except the partition keys should come back categorical.
    category_columns = [column for column in df.columns if column not in partition_columns]
    result = wr.s3.read_parquet(
        path=path,
        dataset=True,
        pyarrow_additional_kwargs={
            "categories": category_columns,
            "strings_to_categorical": True,
        },
    )
    ensure_data_types_category(result)


@pytest.mark.xfail(
is_ray_modin,
raises=TypeError,
Expand Down

0 comments on commit fc0c8bd

Please sign in to comment.