BSE-4358: Python S3 Table support #96

Merged · 14 commits · Jan 2, 2025
1 change: 1 addition & 0 deletions .github/workflows/_test_python_source.yml
@@ -51,6 +51,7 @@ jobs:
role-to-assume: arn:aws:iam::427443013497:role/BodoEngineNightlyRole
role-session-name: BodoEnginePrCiSession
role-skip-session-tagging: true
role-duration-seconds: 10800

# Run Tests
- name: Run Tests
3 changes: 2 additions & 1 deletion bodo/io/iceberg.py
@@ -131,11 +131,12 @@ def format_iceberg_conn(conn_str: str) -> str:
"iceberg+abfs",
"iceberg+abfss",
"iceberg+rest",
"iceberg+arn",
):
raise BodoError(
"'con' must start with one of the following: 'iceberg://', 'iceberg+file://', "
"'iceberg+s3://', 'iceberg+thrift://', 'iceberg+http://', 'iceberg+https://', 'iceberg+glue', 'iceberg+snowflake://', "
"'iceberg+abfs://', 'iceberg+abfss://', 'iceberg+rest://'"
"'iceberg+abfs://', 'iceberg+abfss://', 'iceberg+rest://', 'iceberg+arn'"
)

# Remove Iceberg Prefix when using Internally
2 changes: 1 addition & 1 deletion bodo/tests/conftest.py
@@ -279,7 +279,6 @@ def minio_server():
# For compatibility with older MinIO versions.
os.environ["MINIO_ACCESS_KEY"] = access_key
os.environ["MINIO_SECRET_KEY"] = secret_key
os.environ["AWS_S3_ENDPOINT"] = f"http://{address}/"

args = [
"minio",
@@ -313,6 +312,7 @@ def minio_server_with_s3_envs(minio_server: tuple[str, str, str]):
"AWS_ACCESS_KEY_ID": minio_server[0],
"AWS_SECRET_ACCESS_KEY": minio_server[1],
"AWS_SESSION_TOKEN": None,
"AWS_S3_ENDPOINT": f"http://{minio_server[2]}/",
}
):
yield minio_server
132 changes: 132 additions & 0 deletions bodo/tests/test_s3_tables_iceberg.py
@@ -0,0 +1,132 @@
import random
import string
from io import StringIO

import boto3
import pandas as pd
import pytest

import bodo
from bodo.tests.user_logging_utils import (
check_logger_msg,
create_string_io_logger,
set_logging_stream,
)
from bodo.tests.utils import (
_get_dist_arg,
check_func,
run_rank0,
temp_env_override,
)

pytestmark = pytest.mark.iceberg

bucket_arn = "arn:aws:s3tables:us-east-2:427443013497:bucket/unittest-bucket"


@temp_env_override({"AWS_REGION": "us-east-2"})
def test_basic_read(memory_leak_check):
"""
    Test reading a complete Iceberg table from S3 Tables
"""

def impl(table_name, conn, db_schema):
return pd.read_sql_table(table_name, conn, db_schema)

py_out = pd.DataFrame(
{
"A": ["ally", "bob", "cassie", "david", None],
"B": [10.5, -124.0, 11.11, 456.2, -8e2],
"C": [True, None, False, None, None],
}
)

conn = "iceberg+" + bucket_arn
check_func(
impl,
("bodo_iceberg_read_test", conn, "read_namespace"),
py_output=py_out,
sort_output=True,
reset_index=True,
)


@temp_env_override({"AWS_REGION": "us-east-2"})
def test_read_implicit_pruning(memory_leak_check):
"""
Test reading an Iceberg table from S3 Tables with Bodo
compiler column pruning
"""

def impl(table_name, conn, db_schema):
df = pd.read_sql_table(table_name, conn, db_schema)
df["B"] = df["B"].abs()
return df[["B", "A"]]

py_out = pd.DataFrame(
{
"B": [10.5, 124.0, 11.11, 456.2, 8e2],
"A": ["ally", "bob", "cassie", "david", None],
}
)

conn = "iceberg+" + bucket_arn
stream = StringIO()
logger = create_string_io_logger(stream)
with set_logging_stream(logger, 1):
check_func(
impl,
("bodo_iceberg_read_test", conn, "read_namespace"),
py_output=py_out,
sort_output=True,
reset_index=True,
)
check_logger_msg(stream, "Columns loaded ['A', 'B']")


@temp_env_override({"AWS_REGION": "us-east-2"})
@temp_env_override({"AWS_DEFAULT_REGION": "us-east-2"})
def test_basic_write(memory_leak_check):
"""
Test writing a complete Iceberg table to S3 Tables
"""

@bodo.jit(distributed=["df"])
def write(df, table_name, conn, db_schema):
df.to_sql(table_name, conn, db_schema)

def read(table_name, conn, db_schema):
return pd.read_sql_table(table_name, conn, db_schema)

df = pd.DataFrame(
{
"A": ["ally", "bob", "cassie", "david", None] * 5,
"B": [10.5, -124.0, 11.11, 456.2, -8e2] * 5,
"C": [True, None, False, None, None] * 5,
}
)
conn = "iceberg+" + bucket_arn
    table_name = f"bodo_iceberg_write_test_{''.join(random.choices(string.ascii_lowercase, k=4))}"

try:
write(_get_dist_arg(df), table_name, conn, "write_namespace")

check_func(
read,
(table_name, conn, "write_namespace"),
py_output=df,
sort_output=True,
reset_index=True,
)
finally:

def cleanup():
client = boto3.client("s3tables")
client.delete_table(
name=table_name,
namespace="write_namespace",
tableBucketARN=bucket_arn,
)
client.close()

        # run_rank0 returns a wrapper, so it must be called to execute cleanup on rank 0
        run_rank0(cleanup)()
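For context, a hedged sketch of how the read-side fixture table (`bodo_iceberg_read_test` in `read_namespace`) might be provisioned with the boto3 S3 Tables client. The method and parameter names follow the AWS S3 Tables API as I understand it, and the fixture in this PR was created outside the test, so treat this as an assumption rather than part of the change:

```python
import boto3

# Assumed provisioning calls; mirrors the delete_table cleanup used in the test above.
client = boto3.client("s3tables", region_name="us-east-2")
client.create_namespace(
    tableBucketARN=bucket_arn,
    namespace=["read_namespace"],
)
client.create_table(
    tableBucketARN=bucket_arn,
    namespace="read_namespace",
    name="bodo_iceberg_read_test",
    format="ICEBERG",
)
```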
1 change: 1 addition & 0 deletions docs/docs/iceberg/intro.md
@@ -39,6 +39,7 @@ These are the Iceberg catalogs supported in Bodo Python and SQL:
| Tabular's RESTCatalog | Yes | Yes, via the TabularCatalog | Only tested on S3 |
| GlueCatalog | Yes | Yes, via TablePath | |
| HiveCatalog | Yes | Yes, via TablePath | |
| S3 Tables | Yes | No | |


## Limitations and Considerations
5 changes: 5 additions & 0 deletions docs/docs/iceberg/read_write.md
@@ -103,6 +103,11 @@ The following catalogs are supported:
- Parameter `token` or `credential` is required for authentication and should be retrieved from the REST catalog provider.
- E.g. `iceberg+rest` or `iceberg+rest://<rest-uri>?warehouse=<warehouse>&token=<token>`

- S3 Tables
- Connection string must be of the form `iceberg+arn:aws:s3tables:<region>:<account_number>:bucket/<bucket>`
- `params` is unused
- E.g. `iceberg+arn:aws:s3tables:us-west-2:123456789012:bucket/mybucket`
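For illustration, a minimal read sketch against an S3 Tables bucket using this connection string form; the region, account number, bucket, namespace, and table names below are placeholders:

```python
import pandas as pd
import bodo

@bodo.jit
def read_table(table_name, conn, db_schema):
    # db_schema is the S3 Tables namespace
    return pd.read_sql_table(table_name, conn, db_schema)

# Placeholder ARN; AWS_REGION should match the bucket's region.
conn = "iceberg+arn:aws:s3tables:us-west-2:123456789012:bucket/mybucket"
df = read_table("my_table", conn, "my_namespace")
```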

#### Pandas APIs {#iceberg-pandas}

Example code for reading:
10 changes: 8 additions & 2 deletions iceberg/bodo_iceberg_connector/catalog_conn.py
@@ -5,7 +5,9 @@

from bodo_iceberg_connector.errors import IcebergError, IcebergWarning

CatalogType = Literal["hadoop", "hive", "nessie", "glue", "snowflake", "rest"]
CatalogType = Literal[
"hadoop", "hive", "nessie", "glue", "snowflake", "rest", "s3tables"
]


def _get_first(elems: dict[str, list[str]], param: str) -> str | None:
@@ -64,6 +66,8 @@ def parse_conn_str(
catalog_type = "snowflake"
elif parsed_conn.scheme == "rest":
catalog_type = "rest"
elif parsed_conn.scheme == "arn" and "aws:s3tables" in parsed_conn.path:
catalog_type = "s3tables"

else:
types = ", ".join(
@@ -75,6 +79,7 @@
"glue",
"snowflake",
"rest",
"s3tables",
]
)
raise IcebergError(
@@ -90,10 +95,11 @@
"glue",
"snowflake",
"rest",
"s3tables",
]

# Get Warehouse Location
if catalog_type != "snowflake" and warehouse is None:
if catalog_type not in ("snowflake", "s3tables") and warehouse is None:
warnings.warn(
"It is recommended that the `warehouse` property is included in the connection string for this type of catalog. Bodo can automatically infer what kind of FileIO to use from the warehouse location. It is also highly recommended to include with Glue and Nessie catalogs.",
IcebergWarning,
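As a rough illustration of the detection logic added above (a standard-library sketch, not the connector's actual entry point): the `iceberg+` prefix is stripped before parsing, and the remaining ARN yields scheme `arn` with `aws:s3tables` in the path, which maps to the `s3tables` catalog type:

```python
from urllib.parse import urlparse

conn = "iceberg+arn:aws:s3tables:us-east-2:427443013497:bucket/unittest-bucket"
# The "iceberg+" prefix is removed before the connector parses the string.
parsed = urlparse(conn.removeprefix("iceberg+"))
print(parsed.scheme)                  # "arn"
print("aws:s3tables" in parsed.path)  # True -> catalog_type = "s3tables"
```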
14 changes: 13 additions & 1 deletion iceberg/bodo_iceberg_connector/iceberg-java/pom.xml
@@ -18,7 +18,7 @@
<iceberg.version>1.5.2</iceberg.version>
<hadoop.version>3.3.3</hadoop.version>
<aws.old.version>1.12.382</aws.old.version>
<aws.version>2.19.13</aws.version>
<aws.version>2.29.26</aws.version>
</properties>

<dependencies>
@@ -227,6 +227,18 @@
<version>${aws.version}</version>
</dependency>

<!-- S3Tables Dependencies -->
<dependency>
<groupId>software.amazon.awssdk</groupId>
<artifactId>s3tables</artifactId>
<version>${aws.version}</version>
</dependency>
<dependency>
<groupId>software.amazon.s3tables</groupId>
<artifactId>s3-tables-catalog-for-iceberg</artifactId>
<version>0.1.3</version>
</dependency>

<!-- Logging Dependencies (to avoid duplicate versions) -->
<!-- SLF4J Complains when different versions are registered -->
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-api -->
@@ -51,11 +51,18 @@ public static Triple<Configuration, Map<String, String>, URIBuilder> prepareInput
public static Catalog create(String connStr, String catalogType, String coreSitePath)
throws URISyntaxException {
// Create Catalog
final Catalog catalog;

// S3Tables doesn't use a URI
if (connStr.startsWith("arn:aws:s3tables") && catalogType.equals("s3tables")) {
catalog = S3TablesBuilder.create(connStr);
return CachingCatalog.wrap(catalog);
}

var out = prepareInput(connStr, catalogType, coreSitePath);
Configuration conf = out.getFirst();
Map<String, String> params = out.getSecond();
URIBuilder uriBuilder = out.getThird();
final Catalog catalog;

switch (catalogType.toLowerCase()) {
case "nessie":
@@ -0,0 +1,13 @@
package com.bodo.iceberg.catalog;

import java.util.Map;
import org.apache.iceberg.catalog.Catalog;
import software.amazon.s3tables.iceberg.S3TablesCatalog;

public class S3TablesBuilder {
public static Catalog create(String connStr) {
S3TablesCatalog catalog = new S3TablesCatalog();
catalog.initialize("S3Tables_catalog", Map.of("warehouse", connStr));
return catalog;
}
}
10 changes: 2 additions & 8 deletions iceberg/bodo_iceberg_connector/schema.py
@@ -9,11 +9,10 @@
from py4j.protocol import Py4JError

from bodo_iceberg_connector.catalog_conn import (
gen_table_loc,
normalize_data_loc,
parse_conn_str,
)
from bodo_iceberg_connector.errors import IcebergError, IcebergJavaError
from bodo_iceberg_connector.errors import IcebergJavaError
from bodo_iceberg_connector.py4j_support import (
get_catalog,
launch_jvm,
@@ -113,12 +112,7 @@ def get_iceberg_info(conn_str: str, schema: str, table: str, error: bool = True)
iceberg_schema = None
partition_spec = []
sort_order = []

if warehouse is None:
raise IcebergError(
"`warehouse` parameter required in connection string"
)
table_loc = gen_table_loc(catalog_type, warehouse, schema, table) # type: ignore
table_loc = ""
Comment on lines -116 to +115

Contributor Author:
We don't need this anymore, AFAIK, since we now create a transaction to get the write location.

Contributor:
Is this not used for reads occasionally? Can we test with the E2E tests? It's usually different catalogs that exhibit this behavior.

Contributor Author:
I didn't think so, but I'll check locally since the e2e tests are broken right now.

Contributor Author:
OK, I tried to get the e2e tests running locally and couldn't, even on main. Hadoop did pass, which is the only catalog we don't have unit test coverage for, so I think we're good.

Contributor:
OK

else:
schema_id: int | None = java_table_info.getSchemaID()