fix: make memtable cleanup tests work (#10283)
cpcloud authored Oct 8, 2024
1 parent afebf55 commit 11f8921
Showing 8 changed files with 34 additions and 37 deletions.
4 changes: 2 additions & 2 deletions compose.yaml
@@ -591,7 +591,7 @@ services:
     image: bitnami/spark:3.5.3
     ports:
       - 15002:15002
-    command: /opt/bitnami/spark/sbin/start-connect-server.sh --name ibis_testing --packages org.apache.spark:spark-connect_2.12:3.5.2,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2,io.delta:delta-core_2.12:2.1.0
+    command: /opt/bitnami/spark/sbin/start-connect-server.sh --name ibis_testing --packages org.apache.spark:spark-connect_2.12:3.5.3,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,io.delta:delta-core_2.12:2.1.0
     healthcheck:
       test:
         - CMD-SHELL
@@ -601,7 +601,7 @@ services:
     volumes:
       - spark-connect:/data
       - $PWD/docker/spark-connect/conf.properties:/opt/bitnami/spark/conf/spark-defaults.conf:ro
-      # - $PWD/docker/spark-connect/log4j2.properties:/opt/bitnami/spark/conf/log4j2.properties:ro
+      - $PWD/docker/spark-connect/log4j2.properties:/opt/bitnami/spark/conf/log4j2.properties:ro
     networks:
       - spark-connect

2 changes: 1 addition & 1 deletion docker/spark-connect/conf.properties
@@ -1,6 +1,6 @@
 spark.driver.extraJavaOptions=-Duser.timezone=GMT
 spark.executor.extraJavaOptions=-Duser.timezone=GMT
-spark.jars.packages=org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2
+spark.jars.packages=org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1
 spark.sql.catalog.local.type=hadoop
 spark.sql.catalog.local.warehouse=warehouse
 spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog
6 changes: 4 additions & 2 deletions ibis/backends/bigquery/__init__.py
@@ -698,12 +698,14 @@ def compile(
     ):
         """Compile an Ibis expression to a SQL string."""
         session_dataset = self._session_dataset
+        session_dataset_id = getattr(session_dataset, "dataset_id", None)
+        session_project = getattr(session_dataset, "project", None)
         query = self.compiler.to_sqlglot(
             expr,
             limit=limit,
             params=params,
-            session_dataset_id=getattr(session_dataset, "dataset_id", None),
-            session_project=getattr(session_dataset, "project", None),
+            session_dataset_id=session_dataset_id,
+            session_project=session_project,
             **kwargs,
         )
         queries = query if isinstance(query, list) else [query]
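Aside: the hoist above is behavior-preserving. The use of `getattr` with a `None` default suggests `self._session_dataset` may be `None` before any session dataset exists, so plain attribute access would raise. A minimal sketch of that pattern (the variables here are illustrative, not part of the commit):

```python
# session_dataset may be None when no BigQuery session dataset has been
# created yet; getattr with a default avoids an AttributeError.
session_dataset = None

session_dataset_id = getattr(session_dataset, "dataset_id", None)  # -> None
session_project = getattr(session_dataset, "project", None)        # -> None

assert session_dataset_id is None and session_project is None
```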
2 changes: 1 addition & 1 deletion ibis/backends/pyspark/__init__.py
@@ -448,7 +448,7 @@ def _register_udfs(self, expr: ir.Expr) -> None:
     def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
         schema = PySparkSchema.from_ibis(op.schema)
         df = self._session.createDataFrame(data=op.data.to_frame(), schema=schema)
-        df.createTempView(op.name)
+        df.createOrReplaceTempView(op.name)

     def _finalize_memtable(self, name: str) -> None:
         """No-op, otherwise a deadlock can occur when using Spark Connect."""
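Aside: the switch to `createOrReplaceTempView` makes memtable registration idempotent: registering the same name twice replaces the view instead of raising. A minimal sketch against a local Spark session (not part of this commit):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1,)], schema="x int")

df.createOrReplaceTempView("ibis_memtable_demo")  # first registration
df.createOrReplaceTempView("ibis_memtable_demo")  # replaces silently, no error

df.createTempView("other_view")
# df.createTempView("other_view")  # would raise: temp view already exists
```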
31 changes: 11 additions & 20 deletions ibis/backends/snowflake/tests/test_client.py
@@ -355,32 +355,23 @@ def test_struct_of_json(con):
     assert all(value == raw for value in result.to_pylist())


-def test_list_tables(con):
-    assert {
-        "ASTRONAUTS",
-        "AWARDS_PLAYERS",
-        "BATTING",
-        "DIAMONDS",
-        "FUNCTIONAL_ALLTYPES",
-    }.issubset(con.list_tables())
-
-    like_table = [
+@pytest.mark.parametrize(
+    "database",
+    ["IBIS_TESTING.INFORMATION_SCHEMA", ("IBIS_TESTING", "INFORMATION_SCHEMA")],
+    ids=["dotted-path", "tuple"],
+)
+def test_list_tables_with_database(con, database):
+    like_table = {
         "EVENT_TABLES",
         "EXTERNAL_TABLES",
         "HYBRID_TABLES",
         "TABLES",
         "TABLE_CONSTRAINTS",
         "TABLE_PRIVILEGES",
         "TABLE_STORAGE_METRICS",
-    ]
-
-    assert (
-        con.list_tables(database="IBIS_TESTING.INFORMATION_SCHEMA", like="TABLE")
-        == like_table
-    )
-    assert (
-        con.list_tables(database=("IBIS_TESTING", "INFORMATION_SCHEMA"), like="TABLE")
-        == like_table
-    )
+    }
+    tables = con.list_tables(database=database, like="TABLE")
+    assert like_table.issubset(tables)


 def test_timestamp_memtable(con):
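Aside: the rewrite buys robustness in two ways: `pytest.mark.parametrize` runs one test body against both spellings of the database spec, and `issubset` tolerates any new `INFORMATION_SCHEMA` views Snowflake may add, unlike the old equality check against a list. A self-contained sketch of the pattern with a stubbed backend (all names hypothetical):

```python
import pytest


def fake_list_tables(database):
    """Stand-in for con.list_tables: both spec forms name the same schema."""
    assert database in ("DB.SCHEMA", ("DB", "SCHEMA"))
    return ["EVENT_TABLES", "TABLES", "TABLE_CONSTRAINTS", "TABLE_PRIVILEGES"]


@pytest.mark.parametrize(
    "database",
    ["DB.SCHEMA", ("DB", "SCHEMA")],
    ids=["dotted-path", "tuple"],
)
def test_list_tables_pattern(database):
    # issubset still passes if the warehouse grows new system tables
    assert {"TABLES", "TABLE_CONSTRAINTS"}.issubset(fake_list_tables(database))
```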
18 changes: 11 additions & 7 deletions ibis/backends/sql/compilers/bigquery/__init__.py
@@ -35,16 +35,15 @@
 _NAME_REGEX = re.compile(r'[^!"$()*,./;?@[\\\]^`{}~\n]+')


-_MEMTABLE_PATTERN = re.compile(
-    r"^_?ibis_(?:[A-Za-z_][A-Za-z_0-9]*)_memtable_[a-z0-9]{26}$"
-)
-
-
 def _qualify_memtable(
-    node: sge.Expression, *, dataset: str | None, project: str | None
+    node: sge.Expression,
+    *,
+    dataset: str | None,
+    project: str | None,
+    memtable_names: frozenset[str],
 ) -> sge.Expression:
     """Add a BigQuery dataset and project to memtable references."""
-    if isinstance(node, sge.Table) and _MEMTABLE_PATTERN.match(node.name) is not None:
+    if isinstance(node, sge.Table) and node.name in memtable_names:
         node.args["db"] = dataset
         node.args["catalog"] = project
         # make sure to quote table location
@@ -241,10 +240,15 @@ def to_sqlglot(
         table_expr = expr.as_table()
         geocols = table_expr.schema().geospatial

+        memtable_names = frozenset(
+            op.name for op in table_expr.op().find(ops.InMemoryTable)
+        )
+
         result = sql.transform(
             _qualify_memtable,
             dataset=session_dataset_id,
             project=session_project,
+            memtable_names=memtable_names,
         ).transform(_remove_null_ordering_from_unsupported_window)

         if geocols:
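Aside: matching memtables by the exact names of the `InMemoryTable` ops found in the expression, rather than by regex, means only tables the compiler actually materialized get a dataset and project attached. A minimal sketch of the `sqlglot` mechanics the new code relies on, namely that `Expression.transform` forwards keyword arguments to its callback (table and dataset names below are made up):

```python
import sqlglot
import sqlglot.expressions as sge


def qualify(node, *, dataset, project, memtable_names):
    # Qualify only tables whose names are known memtables.
    if isinstance(node, sge.Table) and node.name in memtable_names:
        node.args["db"] = dataset
        node.args["catalog"] = project
    return node


tree = sqlglot.parse_one("SELECT * FROM ibis_demo_memtable_x", read="bigquery")
qualified = tree.transform(
    qualify,
    dataset="session_dataset",
    project="my_project",
    memtable_names=frozenset({"ibis_demo_memtable_x"}),
)
print(qualified.sql("bigquery"))
# SELECT * FROM my_project.session_dataset.ibis_demo_memtable_x
```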
2 changes: 1 addition & 1 deletion justfile
@@ -136,7 +136,7 @@ download-data owner="ibis-project" repo="testing-data" rev="master":
     fi

 # download the iceberg jar used for testing pyspark and iceberg integration
-download-iceberg-jar pyspark scala="2.12" iceberg="1.5.2":
+download-iceberg-jar pyspark scala="2.12" iceberg="1.6.1":
     #!/usr/bin/env bash
     set -eo pipefail
6 changes: 3 additions & 3 deletions poetry-overrides.nix
@@ -4,10 +4,10 @@ final: prev: {
     inherit (final) pkgs lib;
     pysparkVersion = lib.versions.majorMinor attrs.version;
     jarHashes = {
-      "3.5" = "sha256-KuxLeNgGzIHU5QMls1H2NJyQ3mQVROZExgMvAAk4YYs=";
-      "3.3" = "sha256-W3ij6mwrIDOc4zGqtpCsbg563qHmdMc8eZnLX6bnl2M=";
+      "3.5" = "sha256-h+cYTzHvDKrEFbvfzxvElDNGpYuY10fcg0NPcTnhKss=";
+      "3.3" = "sha256-3D++9VCiLoMP7jPvdCtBn7xnxqHnyQowcqdGUe0M3mk=";
     };
-    icebergVersion = "1.5.2";
+    icebergVersion = "1.6.1";
     scalaVersion = "2.12";
     jarName = "iceberg-spark-runtime-${pysparkVersion}_${scalaVersion}-${icebergVersion}.jar";
     icebergJarUrl = "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${pysparkVersion}_${scalaVersion}/${icebergVersion}/${jarName}";
