Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix MotherDuck multi-catalog configs #4001

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
46 changes: 28 additions & 18 deletions sqlmesh/core/config/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
# Nullable types are problematic
"clickhouse",
}
MOTHERDUCK_TOKEN_REGEX = re.compile(r"(\?motherduck_token=)(\S*)")
MOTHERDUCK_TOKEN_REGEX = re.compile(r"(\?|\&)(motherduck_token=)(\S*)")


class ConnectionConfig(abc.ABC, BaseConfig):
Expand Down Expand Up @@ -269,16 +269,20 @@ def init(cursor: duckdb.DuckDBPyConnection) -> None:
query = f"ATTACH '{path_options}'"
if not path_options.startswith("md:"):
query += f" AS {alias}"
elif self.token:
query += f"?motherduck_token={self.token}"
cursor.execute(query)
except BinderException as e:
# If a user tries to create a catalog pointing at `:memory:` and with the name `memory`
# then we don't want to raise since this happens by default. They are just doing this to
# set it as the default catalog.
if not (
'database with name "memory" already exists' in str(e)
and path_options == ":memory:"
# If a user tried to attach a MotherDuck database/share which has already been attached via
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# If a user tried to attach a MotherDuck database/share which has already by attached via
# If a user tried to attach a MotherDuck database/share which has already been attached via

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this not an error? Why would anyone specify the same MD catalog more than once?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way MotherDuckConnectionConfig is implemented right now, the db adapter uses md: as the connection string for DuckDB, which implicitly attaches all the databases/shares available to the token's account. If you then issue an ATTACH command for a specific database, e.g. md:some_db, DuckDB will emit an error because that database is already attached.

Per my comment (#4001) above, we could either skip the ATTACH and trust that the database actually exists on the account, or we could check that it is there.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. This makes sense, thank you

# `ATTACH 'md:'`, then we don't want to raise since this is expected.
if (
not (
'database with name "memory" already exists' in str(e)
and path_options == ":memory:"
)
and f"""database with name "{path_options.path.replace('md:', '')}" already exists"""
not in str(e)
):
raise e
if i == 0 and not getattr(self, "database", None):
Expand Down Expand Up @@ -331,7 +335,9 @@ def get_catalog(self) -> t.Optional[str]:
return None

def _mask_motherduck_token(self, string: str) -> str:
return MOTHERDUCK_TOKEN_REGEX.sub(lambda m: f"{m.group(1)}{'*' * len(m.group(2))}", string)
return MOTHERDUCK_TOKEN_REGEX.sub(
lambda m: f"{m.group(1)}{m.group(2)}{'*' * len(m.group(3))}", string
)


class MotherDuckConnectionConfig(BaseDuckDBConnectionConfig):
Expand All @@ -349,19 +355,19 @@ def _static_connection_kwargs(self) -> t.Dict[str, t.Any]:
from sqlmesh import __version__

custom_user_agent_config = {"custom_user_agent": f"SQLMesh/{__version__}"}
if not self.database:
return {"config": custom_user_agent_config}
connection_str = f"md:{self.database or ''}"
connection_str = "md:"
if self.database:
# Attach single MD database instead of all databases on the account
connection_str += f"{self.database}?attach_mode=single"
if self.token:
connection_str += f"?motherduck_token={self.token}"
connection_str += f"{'&' if self.database else '?'}motherduck_token={self.token}"
return {"database": connection_str, "config": custom_user_agent_config}


class DuckDBAttachOptions(BaseConfig):
type: str
path: str
read_only: bool = False
token: t.Optional[str] = None

def to_sql(self, alias: str) -> str:
options = []
Expand All @@ -371,14 +377,18 @@ def to_sql(self, alias: str) -> str:
options.append(f"TYPE {self.type.upper()}")
if self.read_only:
options.append("READ_ONLY")
options_sql = f" ({', '.join(options)})" if options else ""
alias_sql = ""
# TODO: Add support for Postgres schema. Currently adding it blocks access to the information_schema
alias_sql = (
if self.type == "motherduck":
# MotherDuck does not support aliasing
f" AS {alias}" if not (self.type == "motherduck" or self.path.startswith("md:")) else ""
)
options_sql = f" ({', '.join(options)})" if options else ""
token_sql = "?motherduck_token=" + self.token if self.token else ""
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing this because it doesn't actually work.

return f"ATTACH '{self.path}{token_sql}'{alias_sql}{options_sql}"
if (md_db := self.path.replace("md:", "")) != alias.replace('"', ""):
raise ConfigError(
f"MotherDuck does not support assigning an alias different from the database name {md_db}."
)
else:
alias_sql += f" AS {alias}"
return f"ATTACH '{self.path}'{alias_sql}{options_sql}"


class DuckDBConnectionConfig(BaseDuckDBConnectionConfig):
Expand Down
52 changes: 38 additions & 14 deletions tests/core/test_connection_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,42 +647,66 @@ def _thread_connection():


def test_motherduck_token_mask(make_config):
config = make_config(
config_1 = make_config(
type="motherduck",
token="short",
database="whodunnit",
)
config_2 = make_config(
type="motherduck",
token="longtoken123456789",
database="whodunnit",
)
config_3 = make_config(
type="motherduck",
token="secret1235",
catalogs={
"test2": DuckDBAttachOptions(
type="motherduck",
path="md:whodunnit?motherduck_token=short",
),
"test1": DuckDBAttachOptions(
type="motherduck", path="md:whodunnit", token="longtoken123456789"
type="motherduck",
path="md:whodunnit",
),
},
)
assert isinstance(config, MotherDuckConnectionConfig)

assert config._mask_motherduck_token(config.catalogs["test1"].path) == "md:whodunnit"
assert isinstance(config_1, MotherDuckConnectionConfig)
assert isinstance(config_2, MotherDuckConnectionConfig)
assert isinstance(config_3, MotherDuckConnectionConfig)
assert config_1._mask_motherduck_token(config_1.database) == "whodunnit"
assert (
config._mask_motherduck_token(config.catalogs["test2"].path)
config_1._mask_motherduck_token(f"md:{config_1.database}?motherduck_token={config_1.token}")
== "md:whodunnit?motherduck_token=*****"
)
assert (
config._mask_motherduck_token("?motherduck_token=secret1235")
config_1._mask_motherduck_token(
f"md:{config_1.database}?attach_mode=single&motherduck_token={config_1.token}"
)
== "md:whodunnit?attach_mode=single&motherduck_token=*****"
)
assert (
config_2._mask_motherduck_token(f"md:{config_2.database}?motherduck_token={config_2.token}")
== "md:whodunnit?motherduck_token=******************"
)
assert (
config_3._mask_motherduck_token(f"md:?motherduck_token={config_3.token}")
== "md:?motherduck_token=**********"
)
assert (
config_1._mask_motherduck_token("?motherduck_token=secret1235")
== "?motherduck_token=**********"
)
assert (
config._mask_motherduck_token("md:whodunnit?motherduck_token=short")
config_1._mask_motherduck_token("md:whodunnit?motherduck_token=short")
== "md:whodunnit?motherduck_token=*****"
)
assert (
config._mask_motherduck_token("md:whodunnit?motherduck_token=longtoken123456789")
config_1._mask_motherduck_token("md:whodunnit?motherduck_token=longtoken123456789")
== "md:whodunnit?motherduck_token=******************"
)
assert (
config._mask_motherduck_token("md:whodunnit?motherduck_token=")
config_1._mask_motherduck_token("md:whodunnit?motherduck_token=")
== "md:whodunnit?motherduck_token="
)
assert config._mask_motherduck_token(":memory:") == ":memory:"
assert config_1._mask_motherduck_token(":memory:") == ":memory:"


def test_bigquery(make_config):
Expand Down