Skip to content

Commit

Permalink
fix: Athena to_iceberg fails with non-lowercase column names
Browse files Browse the repository at this point in the history
  • Loading branch information
LeonLuttenberger committed Mar 18, 2024
1 parent 451939f commit 0ead684
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 0 deletions.
3 changes: 3 additions & 0 deletions awswrangler/athena/_write_iceberg.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ def _determine_differences(
)
frame_columns_types.update(frame_partitions_types)

# lowercase DataFrame columns, as all the column names from Athena will be lowercased
frame_columns_types = {k.lower(): v for k, v in frame_columns_types.items()}

catalog_column_types = typing.cast(
Dict[str, str],
catalog.get_table_types(database=database, table=table, catalog_id=catalog_id, boto3_session=boto3_session),
Expand Down
54 changes: 54 additions & 0 deletions tests/unit/test_athena_iceberg.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,3 +870,57 @@ def test_athena_iceberg_use_partition_function(

assert len(df_out) == len(df) + len(df2)
assert len(df_out.columns) == len(df.columns)


def test_to_iceberg_uppercase_columns(
path: str,
path2: str,
path3: str,
glue_database: str,
glue_table: str,
) -> None:
df = pd.DataFrame(
{
"ID": [1, 2, 3, 4, 5],
"TS": [
ts("2020-01-01 00:00:00.0"),
ts("2020-01-02 00:00:01.0"),
ts("2020-01-03 00:00:00.0"),
ts("2020-01-03 12:30:00.0"),
ts("2020-01-03 16:45:00.0"),
],
}
)
df["ID"] = df["ID"].astype("Int64") # Cast as nullable int64 type

split_index = 4

wr.athena.to_iceberg(
df=df.iloc[:split_index],
database=glue_database,
table=glue_table,
table_location=path,
temp_path=path2,
keep_files=False,
)

wr.athena.to_iceberg(
df=df.iloc[split_index:],
database=glue_database,
table=glue_table,
table_location=path,
temp_path=path2,
s3_output=path3,
keep_files=False,
mode="append",
schema_evolution=True,
)

df_output = wr.athena.read_sql_query(
sql=f'SELECT ID, TS FROM "{glue_table}" ORDER BY ID',
database=glue_database,
ctas_approach=False,
unload_approach=False,
)

assert_pandas_equals(df, df_output)

0 comments on commit 0ead684

Please sign in to comment.