Skip to content

Commit

Permalink
Refactor: produce a single identifier for information schema view
Browse files Browse the repository at this point in the history
  • Loading branch information
georgesittas committed Nov 4, 2024
1 parent 378efbb commit faff23e
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 30 deletions.
53 changes: 32 additions & 21 deletions sqlglot/dialects/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,18 +561,6 @@ def _parse_table_parts(
schema=schema, is_db_reference=is_db_reference, wildcard=True
)

# The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
# dataset, so if the project identifier is omitted we need to fix the ast so that
# the `INFORMATION_SCHEMA.X` bit is represented as a Dot. Otherwise, we wouldn't
# correctly qualify a `Table` node that references these views, because it would
# seem like the "catalog" part is set, when it'd actually be the region/dataset.
#
# See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
if table.db.upper() == "INFORMATION_SCHEMA":
table.set("this", exp.Dot.build([table.args["db"].pop(), table.this.pop()]))
table.set("db", table.args.get("catalog"))
table.set("catalog", None)

# proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
if not table.catalog:
if table.db:
Expand All @@ -586,26 +574,49 @@ def _parse_table_parts(
table.set("db", exp.Identifier(this=parts[0]))
table.set("this", exp.Identifier(this=parts[1]))

if isinstance(table.this, (exp.Identifier, exp.Dot)) and any(
"." in p.name for p in table.parts
):
if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
alias = table.this
catalog, db, this, *rest = (
exp.to_identifier(p, quoted=True)
for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
)

if db and db.name.upper() == "INFORMATION_SCHEMA":
this = exp.Dot.build([db, this]) # type: ignore
db = catalog
catalog, *rest = rest or [None]

if rest and this:
this = exp.Dot.build([this, *rest]) # type: ignore

table = exp.Table(
this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
)
table.meta["quoted_table"] = True
else:
alias = None

# The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
# dataset, so if the project identifier is omitted we need to fix the ast so that
# the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
# Otherwise, we wouldn't correctly qualify a `Table` node that references these
# views, because it would seem like the "catalog" part is set, when it'd actually
# be the region/dataset. Merging the two identifiers into a single one is done to
# avoid producing a 4-part Table reference, which would cause issues in the schema
# module, when there are 3-part table names mixed with information schema views.
#
# See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
table_parts = table.parts
if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
# We need to alias the table here to avoid breaking existing qualified columns.
# This is expected to be safe, because if there's an actual alias coming up in
# the token stream, it will overwrite this one. If there isn't one, we are only
# exposing the name that can be used to reference the view explicitly (a no-op).
exp.alias_(
table,
t.cast(exp.Identifier, alias or table_parts[-1]),
table=True,
copy=False,
)

info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
table.set("this", exp.Identifier(this=info_schema_view, quoted=True))
table.set("db", seq_get(table_parts, -3))
table.set("catalog", seq_get(table_parts, -4))

return table

Expand Down
21 changes: 17 additions & 4 deletions tests/dialects/test_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,11 @@ def test_bigquery(self):
for prefix in ("c.db.", "db.", ""):
with self.subTest(f"Parsing {prefix}INFORMATION_SCHEMA.X into a Table"):
table = self.parse_one(f"`{prefix}INFORMATION_SCHEMA.X`", into=exp.Table)
self.assertIsInstance(table.this, exp.Dot)
this = table.this

self.assertIsInstance(this, exp.Identifier)
self.assertTrue(this.quoted)
self.assertEqual(this.name, "INFORMATION_SCHEMA.X")

table = self.parse_one("x-0._y.z", into=exp.Table)
self.assertEqual(table.catalog, "x-0")
Expand Down Expand Up @@ -203,9 +207,6 @@ def test_bigquery(self):
self.validate_identity(
"MERGE INTO dataset.NewArrivals USING (SELECT * FROM UNNEST([('microwave', 10, 'warehouse #1'), ('dryer', 30, 'warehouse #1'), ('oven', 20, 'warehouse #2')])) ON FALSE WHEN NOT MATCHED THEN INSERT ROW WHEN NOT MATCHED BY SOURCE THEN DELETE"
)
self.validate_identity(
"SELECT * FROM `SOME_PROJECT_ID.SOME_DATASET_ID.INFORMATION_SCHEMA.SOME_VIEW`"
)
self.validate_identity(
"SELECT * FROM test QUALIFY a IS DISTINCT FROM b WINDOW c AS (PARTITION BY d)"
)
Expand Down Expand Up @@ -236,6 +237,18 @@ def test_bigquery(self):
self.validate_identity(
"CREATE OR REPLACE VIEW test (tenant_id OPTIONS (description='Test description on table creation')) AS SELECT 1 AS tenant_id, 1 AS customer_id",
)
self.validate_identity(
"SELECT * FROM `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW`",
"SELECT * FROM `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW` AS `proj.dataset.INFORMATION_SCHEMA.SOME_VIEW`",
)
self.validate_identity(
"SELECT * FROM region_or_dataset.INFORMATION_SCHEMA.TABLES",
"SELECT * FROM region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS TABLES",
)
self.validate_identity(
"SELECT * FROM proj.region_or_dataset.INFORMATION_SCHEMA.TABLES",
"SELECT * FROM proj.region_or_dataset.`INFORMATION_SCHEMA.TABLES` AS TABLES",
)
self.validate_identity(
"CREATE VIEW `d.v` OPTIONS (expiration_timestamp=TIMESTAMP '2020-01-02T04:05:06.007Z') AS SELECT 1 AS c",
"CREATE VIEW `d.v` OPTIONS (expiration_timestamp=CAST('2020-01-02T04:05:06.007Z' AS TIMESTAMP)) AS SELECT 1 AS c",
Expand Down
8 changes: 4 additions & 4 deletions tests/fixtures/optimizer/qualify_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,22 @@ SELECT 1 FROM x.y.z AS z;
# title: only information schema
# dialect: bigquery
SELECT * FROM information_schema.tables;
SELECT * FROM c.db.information_schema.tables AS tables;
SELECT * FROM c.db.`information_schema.tables` AS tables;

# title: information schema with db
# dialect: bigquery
SELECT * FROM y.information_schema.tables;
SELECT * FROM c.y.information_schema.tables AS tables;
SELECT * FROM c.y.`information_schema.tables` AS tables;

# title: information schema with db, catalog
# dialect: bigquery
SELECT * FROM x.y.information_schema.tables;
SELECT * FROM x.y.information_schema.tables AS tables;
SELECT * FROM x.y.`information_schema.tables` AS tables;

# title: information schema with db, catalog, alias
# dialect: bigquery
SELECT * FROM x.y.information_schema.tables AS z;
SELECT * FROM x.y.information_schema.tables AS z;
SELECT * FROM x.y.`information_schema.tables` AS z;

# title: redshift unnest syntax, z.a should be a column, not a table
# dialect: redshift
Expand Down
2 changes: 1 addition & 1 deletion tests/test_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def test_qualify_columns(self, logger):
),
dialect="bigquery",
).sql(),
'WITH "x" AS (SELECT "y"."a" AS "a" FROM "DB"."y" AS "y" CROSS JOIN "a"."b"."INFORMATION_SCHEMA"."COLUMNS" AS "COLUMNS") SELECT "x"."a" AS "a" FROM "x" AS "x"',
'WITH "x" AS (SELECT "y"."a" AS "a" FROM "DB"."y" AS "y" CROSS JOIN "a"."b"."INFORMATION_SCHEMA.COLUMNS" AS "columns") SELECT "x"."a" AS "a" FROM "x" AS "x"',
)

self.assertEqual(
Expand Down

0 comments on commit faff23e

Please sign in to comment.