Refactor by representing info schema views using a single identifier

tobymao · Nov 5, 2024 · 6d12372 · 6d12372
2 parents 143924e + faff23e
commit 6d12372
Show file tree

Hide file tree

Showing 5 changed files with 55 additions and 30 deletions.
diff --git a/sqlglot/dialects/bigquery.py b/sqlglot/dialects/bigquery.py
@@ -561,18 +561,6 @@ def _parse_table_parts(
                 schema=schema, is_db_reference=is_db_reference, wildcard=True
             )
 
-            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
-            # dataset, so if the project identifier is omitted we need to fix the ast so that
-            # the `INFORMATION_SCHEMA.X` bit is represented as a Dot. Otherwise, we wouldn't
-            # correctly qualify a `Table` node that references these views, because it would
-            # seem like the "catalog" part is set, when it'd actually be the region/dataset.
-            #
-            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
-            if table.db.upper() == "INFORMATION_SCHEMA":
-                table.set("this", exp.Dot.build([table.args["db"].pop(), table.this.pop()]))
-                table.set("db", table.args.get("catalog"))
-                table.set("catalog", None)
-
             # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
             if not table.catalog:
                 if table.db:
@@ -586,26 +574,50 @@ def _parse_table_parts(
                         table.set("db", exp.Identifier(this=parts[0]))
                         table.set("this", exp.Identifier(this=parts[1]))
 
-            if isinstance(table.this, (exp.Identifier, exp.Dot)) and any(
-                "." in p.name for p in table.parts
-            ):
+            if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
+                alias = table.this
                 catalog, db, this, *rest = (
                     exp.to_identifier(p, quoted=True)
                     for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                 )
 
-                if db and db.name.upper() == "INFORMATION_SCHEMA":
-                    this = exp.Dot.build([db, this])  # type: ignore
-                    db = catalog
-                    catalog, *rest = rest or [None]
-
                 if rest and this:
                     this = exp.Dot.build([this, *rest])  # type: ignore
 
                 table = exp.Table(
                     this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                 )
                 table.meta["quoted_table"] = True
+            else:
+                alias = None
+
+            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
+            # dataset, so if the project identifier is omitted we need to fix the ast so that
+            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
+            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
+            # views, because it would seem like the "catalog" part is set, when it'd actually
+            # be the region/dataset. The two identifiers are merged into a single one to
+            # avoid producing a 4-part Table reference, which would cause issues in the schema
+            # module when 3-part table names are mixed with information schema views.
+            #
+            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
+            table_parts = table.parts
+            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
+                # We need to alias the table here to avoid breaking existing qualified columns.
+                # This is expected to be safe, because if there's an actual alias coming up in
+                # the token stream, it will overwrite this one. If there isn't one, we are only
+                # exposing the name that can be used to reference the view explicitly (a no-op).
+                exp.alias_(
+                    table,
+                    t.cast(exp.Identifier, alias or table_parts[-1]),
+                    table=True,
+                    copy=False,
+                )
+
+                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
+                table.set("this", exp.Identifier(this=info_schema_view, quoted=True))
+                table.set("db", seq_get(table_parts, -3))
+                table.set("catalog", seq_get(table_parts, -4))
 
             return table
 

diff --git a/sqlglot/optimizer/qualify_tables.py b/sqlglot/optimizer/qualify_tables.py
@@ -51,7 +51,7 @@ def qualify_tables(
     catalog = exp.parse_identifier(catalog, dialect=dialect) if catalog else None
 
     def _qualify(table: exp.Table) -> None:
-        if isinstance(table.this, (exp.Identifier, exp.Dot)):
+        if isinstance(table.this, exp.Identifier):
             if not table.args.get("db"):
                 table.set("db", db)
             if not table.args.get("catalog") and table.args.get("db"):

diff --git a/tests/dialects/test_bigquery.py b/tests/dialects/test_bigquery.py
@@ -87,7 +87,11 @@ def test_bigquery(self):
         for prefix in ("c.db.", "db.", ""):
             with self.subTest(f"Parsing {prefix}INFORMATION_SCHEMA.X into a Table"):
                 table = self.parse_one(f"`{prefix}INFORMATION_SCHEMA.X`", into=exp.Table)
-                self.assertIsInstance(table.this, exp.Dot)
+                this = table.this
+
+                self.assertIsInstance(this, exp.Identifier)
+                self.assertTrue(this.quoted)
+                self.assertEqual(this.name, "INFORMATION_SCHEMA.X")
 
         table = self.parse_one("x-0._y.z", into=exp.Table)
         self.assertEqual(table.catalog, "x-0")
@@ -203,9 +207,6 @@ def test_bigquery(self):
         self.validate_identity(
             "MERGE INTO dataset.NewArrivals USING (SELECT * FROM UNNEST([('microwave', 10, 'warehouse #1'), ('dryer', 30, 'warehouse #1'), ('oven', 20, 'warehouse #2')])) ON FALSE WHEN NOT MATCHED THEN INSERT ROW WHEN NOT MATCHED BY SOURCE THEN DELETE"
         )
-        self.validate_identity(
-            "SELECT * FROM `SOME_PROJECT_ID.SOME_DATASET_ID.INFORMATION_SCHEMA.SOME_VIEW`"
-        )
         self.validate_identity(
             "SELECT * FROM test QUALIFY a IS DISTINCT FROM b WINDOW c AS (PARTITION BY d)"
         )
@@ -236,6 +237,18 @@ def test_bigquery(self):
         self.validate_identity(
             "CREATE OR REPLACE VIEW test (tenant_id OPTIONS (description='Test description on table creation')) AS SELECT 1 AS tenant_id, 1 AS customer_id",
         )
+        self.validate_identity(
+            "SELECT * FROM `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW`",
+            "SELECT * FROM `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW` AS `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW`",
+        )
+        self.validate_identity(
+            "SELECT * FROM region_or_dataset.INFORMATION_SCHEMA.TABLES",
+            "SELECT * FROM region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS TABLES",
+        )
+        self.validate_identity(
+            "SELECT * FROM proj.region_or_dataset.INFORMATION_SCHEMA.TABLES",
+            "SELECT * FROM proj.region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS TABLES",
+        )
         self.validate_identity(
             "CREATE VIEW `d.v` OPTIONS (expiration_timestamp=TIMESTAMP '2020-01-02T04:05:06.007Z') AS SELECT 1 AS c",
             "CREATE VIEW `d.v` OPTIONS (expiration_timestamp=CAST('2020-01-02T04:05:06.007Z' AS TIMESTAMP)) AS SELECT 1 AS c",

diff --git a/tests/fixtures/optimizer/qualify_tables.sql b/tests/fixtures/optimizer/qualify_tables.sql
@@ -17,22 +17,22 @@ SELECT 1 FROM x.y.z AS z;
 # title: only information schema
 # dialect: bigquery
 SELECT * FROM information_schema.tables;
-SELECT * FROM c.db.information_schema.tables AS tables;
+SELECT * FROM c.db.`information_schema.tables` AS tables;
 
 # title: information schema with db
 # dialect: bigquery
 SELECT * FROM y.information_schema.tables;
-SELECT * FROM c.y.information_schema.tables AS tables;
+SELECT * FROM c.y.`information_schema.tables` AS tables;
 
 # title: information schema with db, catalog
 # dialect: bigquery
 SELECT * FROM x.y.information_schema.tables;
-SELECT * FROM x.y.information_schema.tables AS tables;
+SELECT * FROM x.y.`information_schema.tables` AS tables;
 
 # title: information schema with db, catalog, alias
 # dialect: bigquery
 SELECT * FROM x.y.information_schema.tables AS z;
-SELECT * FROM x.y.information_schema.tables AS z;
+SELECT * FROM x.y.`information_schema.tables` AS z;
 
 # title: redshift unnest syntax, z.a should be a column, not a table
 # dialect: redshift

diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py
@@ -315,7 +315,7 @@ def test_qualify_columns(self, logger):
                 ),
                 dialect="bigquery",
             ).sql(),
-            'WITH "x" AS (SELECT "y"."a" AS "a" FROM "DB"."y" AS "y" CROSS JOIN "a"."b"."INFORMATION_SCHEMA"."COLUMNS" AS "COLUMNS") SELECT "x"."a" AS "a" FROM "x" AS "x"',
+            'WITH "x" AS (SELECT "y"."a" AS "a" FROM "DB"."y" AS "y" CROSS JOIN "a"."b"."INFORMATION_SCHEMA.COLUMNS" AS "columns") SELECT "x"."a" AS "a" FROM "x" AS "x"',
         )
 
         self.assertEqual(