diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index ec555003be..59cfaaa5fc 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -9,6 +9,7 @@ from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.hive_metastore.tables import TablesCrawler +from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler logger = logging.getLogger(__name__) @@ -21,6 +22,7 @@ class Grant: database: str | None = None table: str | None = None view: str | None = None + udf: str | None = None any_file: bool = False anonymous_function: bool = False @@ -31,6 +33,7 @@ def type_and_key( database: str | None = None, table: str | None = None, view: str | None = None, + udf: str | None = None, any_file: bool = False, anonymous_function: bool = False, ) -> tuple[str, str]: @@ -42,6 +45,10 @@ def type_and_key( catalog = "hive_metastore" if catalog is None else catalog database = "default" if database is None else database return "VIEW", f"{catalog}.{database}.{view}" + if udf is not None: + catalog = "hive_metastore" if catalog is None else catalog + database = "default" if database is None else database + return "FUNCTION", f"{catalog}.{database}.{udf}" if database is not None: catalog = "hive_metastore" if catalog is None else catalog return "DATABASE", f"{catalog}.{database}" @@ -53,7 +60,7 @@ def type_and_key( if catalog is not None: return "CATALOG", catalog msg = ( - f"invalid grant keys: catalog={catalog}, database={database}, view={view}, " + f"invalid grant keys: catalog={catalog}, database={database}, view={view}, udf={udf}, " f"any_file={any_file}, anonymous_function={anonymous_function}" ) raise ValueError(msg) @@ -69,6 +76,7 @@ def this_type_and_key(self): database=self.database, table=self.table, view=self.view, + udf=self.udf, any_file=self.any_file, anonymous_function=self.anonymous_function, ) @@ -135,9 +143,13 @@ def 
uc_grant_sql(self): class GrantsCrawler(CrawlerBase[Grant]): - def __init__(self, tc: TablesCrawler): + def __init__(self, tc: TablesCrawler, udf: UdfsCrawler): + assert tc._backend == udf._backend + assert tc._catalog == udf._catalog + assert tc._schema == udf._schema super().__init__(tc._backend, tc._catalog, tc._schema, "grants", Grant) self._tc = tc + self._udf = udf def snapshot(self) -> Iterable[Grant]: return self._snapshot(partial(self._try_load), partial(self._crawl)) @@ -148,7 +160,7 @@ def _try_load(self): def _crawl(self) -> Iterable[Grant]: """ - Crawls and lists grants for all databases, tables, views, any file + Crawls and lists grants for all databases, tables, views, udfs, any file and anonymous function within hive_metastore. Returns: @@ -159,12 +171,14 @@ def _crawl(self) -> Iterable[Grant]: table/view-specific grants. - Iterates through tables in the specified database using the `_tc.snapshot` method. - For each table, adds tasks to fetch grants for the table or its view, depending on the kind of the table. + - Iterates through udfs in the specified database using the `_udf.snapshot` method. + - For each udf, adds tasks to fetch grants for the udf. - Executes the tasks concurrently using Threads.gather. - Flattens the list of retrieved grant lists into a single list of Grant objects. Note: - The method assumes that the `_grants` method fetches grants based on the provided parameters (catalog, - database, table, view, any file, anonymous function). + database, table, view, udfs, any file, anonymous function). Returns: list[Grant]: A list of Grant objects representing the grants found in hive_metastore. 
@@ -181,6 +195,9 @@ def _crawl(self) -> Iterable[Grant]: fn = partial(self._grants, catalog=catalog, database=table.database) # views are recognized as tables tasks.append(partial(fn, table=table.name)) + for udf in self._udf.snapshot(): + fn = partial(self._grants, catalog=catalog, database=udf.database) + tasks.append(partial(fn, udf=udf.name)) catalog_grants, errors = Threads.gather(f"listing grants for {catalog}", tasks) if len(errors) > 0: raise ManyError(errors) @@ -206,6 +223,7 @@ def _grants( database: str | None = None, table: str | None = None, view: str | None = None, + udf: str | None = None, any_file: bool = False, anonymous_function: bool = False, ) -> list[Grant]: @@ -217,6 +235,7 @@ def _grants( database (str | None): The database name (optional). table (str | None): The table name (optional). view (str | None): The view name (optional). + udf (str | None): The udf name (optional). any_file (bool): Whether to include any file grants (optional). anonymous_function (bool): Whether to include anonymous function grants (optional). @@ -245,13 +264,12 @@ def _grants( database=self._try_valid(database), table=self._try_valid(table), view=self._try_valid(view), + udf=self._try_valid(udf), any_file=any_file, anonymous_function=anonymous_function, ) try: grants = [] - # Added ANY FILE and ANONYMOUS FUNCTION in object_type_normalization - # to capture the same in grants. 
issue:#623 object_type_normalization = { "SCHEMA": "DATABASE", "CATALOG$": "CATALOG", @@ -271,6 +289,7 @@ def _grants( action_type=action_type, table=table, view=view, + udf=udf, database=database, catalog=catalog, any_file=any_file, diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py new file mode 100644 index 0000000000..9731b1bf98 --- /dev/null +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -0,0 +1,104 @@ +import logging +from collections.abc import Iterable, Iterator +from dataclasses import dataclass +from functools import partial + +from databricks.labs.blueprint.parallel import Threads + +from databricks.labs.ucx.framework.crawlers import CrawlerBase, SqlBackend +from databricks.labs.ucx.mixins.sql import Row + +logger = logging.getLogger(__name__) + + +@dataclass +class Udf: + catalog: str + database: str + name: str + func_type: str + func_input: str + func_returns: str + deterministic: bool + data_access: str + body: str + comment: str = "" + + @property + def key(self) -> str: + return f"{self.catalog}.{self.database}.{self.name}".lower() + + +class UdfsCrawler(CrawlerBase[Udf]): + def __init__(self, backend: SqlBackend, schema): + """ + Initializes a UdfsCrawler instance. + + Args: + backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) + schema: The schema name for the inventory persistence. + """ + super().__init__(backend, "hive_metastore", schema, "udfs", Udf) + + def _all_databases(self) -> Iterator[Row]: + yield from self._fetch("SHOW DATABASES") + + def snapshot(self) -> list[Udf]: + """ + Takes a snapshot of UDFs in the specified catalog and database. + + Returns: + list[Udf]: A list of Udf objects representing the snapshot of UDFs. 
+ """ + return self._snapshot(self._try_load, self._crawl) + + def _try_load(self) -> Iterable[Udf]: + """Tries to load udf information from the database or throws TABLE_OR_VIEW_NOT_FOUND error""" + for row in self._fetch(f"SELECT * FROM {self._full_name}"): + yield Udf(*row) + + def _crawl(self) -> Iterable[Udf]: + """Crawls and lists udfs within the specified catalog and database.""" + tasks = [] + catalog = "hive_metastore" + # need to set the current catalog otherwise "SHOW USER FUNCTIONS FROM" is raising error: + # "target schema is not in the current catalog" + self._exec(f"USE CATALOG {catalog};") + for (database,) in self._all_databases(): + logger.debug(f"[{catalog}.{database}] listing udfs") + for (udf,) in self._fetch(f"SHOW USER FUNCTIONS FROM {catalog}.{database};"): + if udf.startswith(f"{catalog}.{database}"): + udf_name = udf[udf.rfind(".") + 1 :] # remove catalog and database info from the name + tasks.append(partial(self._describe, catalog, database, udf_name)) + catalog_tables, errors = Threads.gather(f"listing udfs in {catalog}", tasks) + if len(errors) > 0: + logger.error(f"Detected {len(errors)} while scanning udfs in {catalog}") + return catalog_tables + + def _describe(self, catalog: str, database: str, udf: str) -> Udf | None: + """Fetches metadata like udf type, input, returns, data access and body + if specified for a specific udf within the given catalog and database. 
+ """ + full_name = f"{catalog}.{database}.{udf}" + try: + logger.debug(f"[{full_name}] fetching udf metadata") + describe = {} + for key_value in self._fetch(f"DESCRIBE FUNCTION EXTENDED {full_name}"): + if ":" in key_value: # skip free text configs that don't have a key + key, value = key_value.split(":", 1) + describe[key] = value.strip() + return Udf( + catalog=catalog.lower(), + database=database.lower(), + name=udf.lower(), + func_type=describe.get("Type", "UNKNOWN"), + func_input=describe.get("Input", "UNKNOWN"), + func_returns=describe.get("Returns", "UNKNOWN"), + deterministic=describe.get("Deterministic", "false").lower() == "true", + data_access=describe.get("Data Access", "UNKNOWN"), + comment=describe.get("Comment", "UNKNOWN"), + body=describe.get("Body", "UNKNOWN"), + ) + except Exception as e: + logger.error(f"Couldn't fetch information for udf {full_name} : {e}") + return None diff --git a/src/databricks/labs/ucx/mixins/fixtures.py b/src/databricks/labs/ucx/mixins/fixtures.py index 64f92c3165..0dc2796a52 100644 --- a/src/databricks/labs/ucx/mixins/fixtures.py +++ b/src/databricks/labs/ucx/mixins/fixtures.py @@ -15,12 +15,13 @@ import pytest from databricks.sdk import AccountClient, WorkspaceClient from databricks.sdk.core import DatabricksError -from databricks.sdk.errors import ResourceConflict +from databricks.sdk.errors import NotFound, ResourceConflict from databricks.sdk.retries import retried from databricks.sdk.service import compute, iam, jobs, pipelines, sql, workspace from databricks.sdk.service.catalog import ( CatalogInfo, DataSourceFormat, + FunctionInfo, SchemaInfo, TableInfo, TableType, @@ -1023,6 +1024,45 @@ def remove(table_info: TableInfo): yield from factory("table", create, remove) +@pytest.fixture +def make_udf(sql_backend, make_schema, make_random) -> Generator[Callable[..., FunctionInfo], None, None]: + def create( + *, catalog_name="hive_metastore", schema_name: str | None = None, name: str | None = None + ) -> FunctionInfo: + if schema_name is None: + 
schema = make_schema(catalog_name=catalog_name) + catalog_name = schema.catalog_name + schema_name = schema.name + + if name is None: + name = f"ucx_T{make_random(4)}".lower() + + full_name = f"{catalog_name}.{schema_name}.{name}".lower() + ddl = f"CREATE FUNCTION {full_name}(x INT) RETURNS FLOAT CONTAINS SQL DETERMINISTIC RETURN 0;" + + sql_backend.execute(ddl) + udf_info = FunctionInfo( + catalog_name=catalog_name, + schema_name=schema_name, + name=name, + full_name=full_name, + ) + + logger.info(f"Function {udf_info.full_name} created") + return udf_info + + def remove(udf_info: FunctionInfo): + try: + sql_backend.execute(f"DROP FUNCTION IF EXISTS {udf_info.full_name}") + except NotFound as e: + if "SCHEMA_NOT_FOUND" in str(e): + logger.warning("Schema was already dropped while executing the test", exc_info=e) + else: + raise e + + yield from factory("table", create, remove) + + @pytest.fixture def make_query(ws, make_table, make_random): def create() -> QueryInfo: diff --git a/src/databricks/labs/ucx/queries/views/grant_detail.sql b/src/databricks/labs/ucx/queries/views/grant_detail.sql index 5905b03700..3994584320 100644 --- a/src/databricks/labs/ucx/queries/views/grant_detail.sql +++ b/src/databricks/labs/ucx/queries/views/grant_detail.sql @@ -6,6 +6,7 @@ SELECT + WHEN udf IS NOT NULL THEN 'UDF' WHEN table IS NOT NULL THEN 'TABLE' WHEN database IS NOT NULL THEN 'DATABASE' WHEN catalog IS NOT NULL THEN 'CATALOG' ELSE 'UNKNOWN' END AS object_type, CASE @@ -15,6 +16,7 @@ SELECT + WHEN udf IS NOT NULL THEN CONCAT(catalog, '.', database, '.', udf) WHEN table IS NOT NULL THEN CONCAT(catalog, '.', database, '.', table) WHEN database IS NOT NULL THEN CONCAT(catalog, '.', database) WHEN catalog IS NOT NULL THEN catalog ELSE 'UNKNOWN' END AS object_id, action_type, @@ -28,5 +30,6 @@ SELECT principal, catalog, database, - table + table, + udf FROM $inventory.grants where database != split("$inventory",'[.]')[1] \ No newline at end of file diff --git 
a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index 4f427b420a..aca179c282 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -19,6 +19,7 @@ TablesCrawler, ) from databricks.labs.ucx.hive_metastore.table_size import TableSizeCrawler +from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.workspace_access.generic import WorkspaceListing from databricks.labs.ucx.workspace_access.groups import GroupManager from databricks.labs.ucx.workspace_access.manager import PermissionManager @@ -53,7 +54,8 @@ def crawl_grants(cfg: WorkspaceConfig): ACLs enabled and available for retrieval.""" backend = RuntimeBackend() tables = TablesCrawler(backend, cfg.inventory_database) - grants = GrantsCrawler(tables) + udfs = UdfsCrawler(backend, cfg.inventory_database) + grants = GrantsCrawler(tables, udfs) grants.snapshot() diff --git a/src/databricks/labs/ucx/workspace_access/manager.py b/src/databricks/labs/ucx/workspace_access/manager.py index dd9aa25ba5..07173b96be 100644 --- a/src/databricks/labs/ucx/workspace_access/manager.py +++ b/src/databricks/labs/ucx/workspace_access/manager.py @@ -15,6 +15,7 @@ SqlBackend, ) from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler +from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.workspace_access import generic, redash, scim, secrets from databricks.labs.ucx.workspace_access.base import AclSupport, Permissions from databricks.labs.ucx.workspace_access.groups import MigrationState @@ -71,7 +72,8 @@ def factory( secrets_support = secrets.SecretScopesSupport(ws) scim_support = scim.ScimSupport(ws) tables_crawler = TablesCrawler(sql_backend, inventory_database) - grants_crawler = GrantsCrawler(tables_crawler) + udfs_crawler = UdfsCrawler(sql_backend, inventory_database) + grants_crawler = GrantsCrawler(tables_crawler, udfs_crawler) tacl_support = TableAclSupport(grants_crawler, 
sql_backend) return cls( sql_backend, inventory_database, [generic_support, sql_support, secrets_support, scim_support, tacl_support] diff --git a/src/databricks/labs/ucx/workspace_access/tacl.py b/src/databricks/labs/ucx/workspace_access/tacl.py index 5b014a34f2..e774d365cd 100644 --- a/src/databricks/labs/ucx/workspace_access/tacl.py +++ b/src/databricks/labs/ucx/workspace_access/tacl.py @@ -65,6 +65,9 @@ def _from_reduced(self, object_type: str, object_id: str, principal: str, action case "CATALOG": catalog = object_id return Grant(principal=principal, action_type=action_type, catalog=catalog) + case "FUNCTION": + catalog, database, udf = object_id.split(".") + return Grant(principal=principal, action_type=action_type, catalog=catalog, database=database, udf=udf) case "ANONYMOUS FUNCTION": catalog = object_id return Grant(principal=principal, action_type=action_type, catalog=catalog, anonymous_function=True) @@ -73,7 +76,7 @@ def _from_reduced(self, object_type: str, object_id: str, principal: str, action return Grant(principal=principal, action_type=action_type, catalog=catalog, any_file=True) def object_types(self) -> set[str]: - return {"TABLE", "DATABASE", "VIEW", "CATALOG", "ANONYMOUS FUNCTION", "ANY FILE"} + return {"TABLE", "DATABASE", "VIEW", "CATALOG", "FUNCTION", "ANONYMOUS FUNCTION", "ANY FILE"} def get_apply_task(self, item: Permissions, migration_state: MigrationState): grant = Grant(**json.loads(item.raw)) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 7822c89617..f7d48f7439 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -10,12 +10,13 @@ from databricks.sdk.core import Config from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried -from databricks.sdk.service.catalog import TableInfo +from databricks.sdk.service.catalog import FunctionInfo, TableInfo from databricks.labs.ucx.framework.crawlers import SqlBackend from databricks.labs.ucx.hive_metastore 
import TablesCrawler from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping from databricks.labs.ucx.hive_metastore.tables import Table +from databricks.labs.ucx.hive_metastore.udfs import Udf, UdfsCrawler from databricks.labs.ucx.mixins.fixtures import * # noqa: F403 from databricks.labs.ucx.workspace_access.groups import MigratedGroup @@ -136,6 +137,29 @@ def snapshot(self) -> list[Table]: return self._tables +class StaticUdfsCrawler(UdfsCrawler): + def __init__(self, sql_backend: SqlBackend, schema: str, udfs: list[FunctionInfo]): + super().__init__(sql_backend, schema) + self._udfs = [ + Udf( + catalog=_.catalog_name, + database=_.schema_name, + name=_.name, + body="5", + comment="_", + data_access="CONTAINS SQL", + deterministic=True, + func_input="STRING", + func_returns="INT", + func_type="SQL", + ) + for _ in udfs + ] + + def snapshot(self) -> list[Udf]: + return self._udfs + + class StaticTableMapping(TableMapping): def __init__( self, ws: WorkspaceClient, backend: SqlBackend, folder: str | None = None, rules: list[Rule] | None = None diff --git a/tests/integration/hive_metastore/test_grants.py b/tests/integration/hive_metastore/test_grants.py index 955b967b23..81adbe103d 100644 --- a/tests/integration/hive_metastore/test_grants.py +++ b/tests/integration/hive_metastore/test_grants.py @@ -1,4 +1,5 @@ import logging +from collections import defaultdict from datetime import timedelta from databricks.sdk.errors import NotFound @@ -6,7 +7,7 @@ from databricks.labs.ucx.hive_metastore import GrantsCrawler -from ..conftest import StaticTablesCrawler +from ..conftest import StaticTablesCrawler, StaticUdfsCrawler logger = logging.getLogger(__name__) @@ -36,7 +37,8 @@ def test_all_grants_in_databases(sql_backend, inventory_schema, make_schema, mak # 20 seconds less than TablesCrawler(sql_backend, inventory_schema) tables = StaticTablesCrawler(sql_backend, inventory_schema, [table_a, table_b, view_c, view_d, table_e]) - grants = 
GrantsCrawler(tables) + udfs = StaticUdfsCrawler(sql_backend, inventory_schema, []) + grants = GrantsCrawler(tables, udfs) all_grants = {} for grant in grants.snapshot(): @@ -53,3 +55,27 @@ def test_all_grants_in_databases(sql_backend, inventory_schema, make_schema, mak assert all_grants[f"{group_b.display_name}.{view_c.full_name}"] == "MODIFY" assert all_grants[f"{group_b.display_name}.{view_d.full_name}"] == "MODIFY" assert all_grants[f"{group_b.display_name}.{table_e.full_name}"] == "MODIFY" + + +@retried(on=[NotFound], timeout=timedelta(minutes=3)) +def test_all_grants_for_udfs_in_databases(sql_backend, inventory_schema, make_schema, make_udf, make_group): + group = make_group() + schema = make_schema() + udf_a = make_udf(schema_name=schema.name) + udf_b = make_udf(schema_name=schema.name) + + sql_backend.execute(f"GRANT SELECT ON FUNCTION {udf_a.full_name} TO `{group.display_name}`") + sql_backend.execute(f"GRANT READ_METADATA ON FUNCTION {udf_a.full_name} TO `{group.display_name}`") + sql_backend.execute(f"ALTER FUNCTION {udf_a.full_name} OWNER TO `{group.display_name}`") + sql_backend.execute(f"GRANT ALL PRIVILEGES ON FUNCTION {udf_b.full_name} TO `{group.display_name}`") + + tables = StaticTablesCrawler(sql_backend, inventory_schema, []) + udfs = StaticUdfsCrawler(sql_backend, inventory_schema, [udf_a, udf_b]) + grants = GrantsCrawler(tables, udfs) + + actual_grants = defaultdict(set) + for grant in grants.snapshot(): + actual_grants[f"{grant.principal}.{grant.object_key}"].add(grant.action_type) + + assert {"SELECT", "READ_METADATA", "OWN"} == actual_grants[f"{group.display_name}.{udf_a.full_name}"] + assert {"SELECT", "READ_METADATA"} == actual_grants[f"{group.display_name}.{udf_b.full_name}"] diff --git a/tests/integration/hive_metastore/test_udfs.py b/tests/integration/hive_metastore/test_udfs.py new file mode 100644 index 0000000000..9f59db86f7 --- /dev/null +++ b/tests/integration/hive_metastore/test_udfs.py @@ -0,0 +1,28 @@ +import logging +from 
datetime import timedelta + +from databricks.sdk.errors import NotFound +from databricks.sdk.retries import retried + +from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler + +logger = logging.getLogger(__name__) + + +@retried(on=[NotFound], timeout=timedelta(minutes=2)) +def test_describe_all_udfs_in_databases(ws, sql_backend, inventory_schema, make_schema, make_udf): + schema_a = make_schema(catalog_name="hive_metastore") + make_schema(catalog_name="hive_metastore") + udf_a = make_udf(schema_name=schema_a.name) + udf_b = make_udf(schema_name=schema_a.name) + + udfs_crawler = UdfsCrawler(sql_backend, inventory_schema) + actual_grants = udfs_crawler.snapshot() + + unique_udf_grants = { + grant.name + for grant in actual_grants + if f"{grant.catalog}.{grant.database}.{grant.name}" in [udf_a.full_name, udf_b.full_name] + } + + assert len(unique_udf_grants) == 2 diff --git a/tests/integration/workspace_access/test_groups.py b/tests/integration/workspace_access/test_groups.py index 62c27807b1..8ef2ec4cb6 100644 --- a/tests/integration/workspace_access/test_groups.py +++ b/tests/integration/workspace_access/test_groups.py @@ -17,7 +17,7 @@ from databricks.labs.ucx.workspace_access.manager import PermissionManager from databricks.labs.ucx.workspace_access.tacl import TableAclSupport -from ..conftest import StaticTablesCrawler +from ..conftest import StaticTablesCrawler, StaticUdfsCrawler logger = logging.getLogger(__name__) @@ -229,7 +229,8 @@ def test_replace_workspace_groups_with_account_groups( sql_backend.execute(f"GRANT SELECT, MODIFY ON TABLE {dummy_table.full_name} TO `{ws_group.display_name}`") tables = StaticTablesCrawler(sql_backend, inventory_schema, [dummy_table]) - grants = GrantsCrawler(tables) + udfs = StaticUdfsCrawler(sql_backend, inventory_schema, []) + grants = GrantsCrawler(tables, udfs) @retried(on=[AssertionError], timeout=timedelta(seconds=30)) def assert_table_has_two_grants(): diff --git a/tests/integration/workspace_access/test_tacl.py 
b/tests/integration/workspace_access/test_tacl.py index aa8b4531b0..c9ebadf7d7 100644 --- a/tests/integration/workspace_access/test_tacl.py +++ b/tests/integration/workspace_access/test_tacl.py @@ -1,10 +1,11 @@ import logging +from collections import defaultdict from databricks.labs.ucx.hive_metastore import GrantsCrawler from databricks.labs.ucx.workspace_access.groups import MigratedGroup from databricks.labs.ucx.workspace_access.tacl import TableAclSupport -from ..conftest import StaticTablesCrawler +from ..conftest import StaticTablesCrawler, StaticUdfsCrawler from . import apply_tasks logger = logging.getLogger(__name__) @@ -20,7 +21,8 @@ def test_permission_for_files_anonymous_func(sql_backend, inventory_schema, make sql_backend.execute(f"GRANT SELECT ON ANONYMOUS FUNCTION TO `{old.display_name}`") tables = StaticTablesCrawler(sql_backend, inventory_schema, []) - grants = GrantsCrawler(tables) + udfs = StaticUdfsCrawler(sql_backend, inventory_schema, []) + grants = GrantsCrawler(tables, udfs) tacl_support = TableAclSupport(grants, sql_backend) apply_tasks(tacl_support, [MigratedGroup.partial_info(old, new)]) @@ -70,7 +72,8 @@ def test_hms2hms_owner_permissions(sql_backend, inventory_schema, make_schema, m sql_backend.execute(f"GRANT SELECT, MODIFY ON TABLE {table_c.full_name} TO `{third.name_in_workspace}`") tables = StaticTablesCrawler(sql_backend, inventory_schema, [table_a, table_b, table_c]) - grants = GrantsCrawler(tables) + udfs = StaticUdfsCrawler(sql_backend, inventory_schema, []) + grants = GrantsCrawler(tables, udfs) original_table_grants = { "a": grants.for_table_info(table_a), @@ -121,3 +124,39 @@ def test_hms2hms_owner_permissions(sql_backend, inventory_schema, make_schema, m "SELECT", "USAGE", }, second.name_in_account + + +def test_permission_for_udfs(sql_backend, inventory_schema, make_schema, make_udf, make_group_pair): + group = make_group_pair() + schema = make_schema() + udf_a = make_udf(schema_name=schema.name) + udf_b = 
make_udf(schema_name=schema.name) + + sql_backend.execute(f"GRANT SELECT ON FUNCTION {udf_a.full_name} TO `{group.name_in_workspace}`") + sql_backend.execute(f"ALTER FUNCTION {udf_a.full_name} OWNER TO `{group.name_in_workspace}`") + sql_backend.execute(f"GRANT READ_METADATA ON FUNCTION {udf_b.full_name} TO `{group.name_in_workspace}`") + + tables = StaticTablesCrawler(sql_backend, inventory_schema, []) + udfs = StaticUdfsCrawler(sql_backend, inventory_schema, [udf_a, udf_b]) + grants = GrantsCrawler(tables, udfs) + + all_initial_grants = set() + for grant in grants.snapshot(): + all_initial_grants.add(f"{grant.principal}.{grant.object_key}:{grant.action_type}") + + assert f"{group.name_in_workspace}.{udf_a.full_name}:SELECT" in all_initial_grants + assert f"{group.name_in_workspace}.{udf_a.full_name}:OWN" in all_initial_grants + assert f"{group.name_in_workspace}.{udf_b.full_name}:READ_METADATA" in all_initial_grants + + tacl_support = TableAclSupport(grants, sql_backend) + apply_tasks(tacl_support, [group]) + + actual_udf_a_grants = defaultdict(set) + for grant in grants._grants(catalog=schema.catalog_name, database=schema.name, udf=udf_a.name): + actual_udf_a_grants[grant.principal].add(grant.action_type) + assert {"SELECT", "OWN"} == actual_udf_a_grants[group.name_in_account] + + actual_udf_b_grants = defaultdict(set) + for grant in grants._grants(catalog=schema.catalog_name, database=schema.name, udf=udf_b.name): + actual_udf_b_grants[grant.principal].add(grant.action_type) + assert {"READ_METADATA"} == actual_udf_b_grants[group.name_in_account] diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index bba04d4ef1..0b049ba1a4 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -2,6 +2,7 @@ from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler from databricks.labs.ucx.hive_metastore.tables import TablesCrawler +from 
databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.mixins.sql import Row from ..framework.mocks import MockBackend @@ -61,6 +62,15 @@ def test_type_and_key_anonymous_function(): assert grant.object_key == "" +def test_type_and_key_udf(): + grant = Grant.type_and_key(catalog="hive_metastore", database="mydb", udf="myfunction") + assert grant == ("FUNCTION", "hive_metastore.mydb.myfunction") + + grant = Grant(principal="user", action_type="SELECT", catalog="hive_metastore", database="mydb", udf="myfunction") + assert grant.this_type_and_key()[0] == "FUNCTION" + assert grant.object_key == "hive_metastore.mydb.myfunction" + + def test_type_and_key_invalid(): with pytest.raises(ValueError): Grant.type_and_key() @@ -87,6 +97,11 @@ def test_hive_database_own_sql(): assert grant.hive_grant_sql() == ["ALTER DATABASE hive_metastore.mydb OWNER TO `user`"] +def test_hive_udf_own_sql(): + grant = Grant(principal="user", action_type="OWN", catalog="hive_metastore", database="mydb", udf="myfunction") + assert grant.hive_grant_sql() == ["ALTER FUNCTION hive_metastore.mydb.myfunction OWNER TO `user`"] + + def test_hive_revoke_sql(): grant = Grant(principal="user", action_type="SELECT", catalog="hive_metastore", database="mydb", table="mytable") assert grant.hive_revoke_sql() == "REVOKE SELECT ON TABLE hive_metastore.mydb.mytable FROM `user`" @@ -111,6 +126,10 @@ def test_hive_revoke_sql(): Grant("me", "INVALID", catalog="hive_metastore", database="mydb"), None, ), + ( + Grant("me", "SELECT", catalog="hive_metastore", database="mydb", udf="myfunction"), + "GRANT EXECUTE ON FUNCTION hive_metastore.mydb.myfunction TO `me`", + ), ], ) def test_uc_sql(grant, query): @@ -150,7 +169,8 @@ def make_row(data, columns): def test_crawler_no_data(): b = MockBackend() table = TablesCrawler(b, "schema") - crawler = GrantsCrawler(table) + udf = UdfsCrawler(b, "schema") + crawler = GrantsCrawler(table, udf) grants = crawler.snapshot() assert len(grants) == 0 @@ 
-186,27 +206,86 @@ def test_crawler_crawl(): } ) table = TablesCrawler(b, "schema") - crawler = GrantsCrawler(table) + udf = UdfsCrawler(b, "schema") + crawler = GrantsCrawler(table, udf) grants = crawler.snapshot() assert len(grants) == 3 -def test_crawler_snapshot(): - # Test with no data +def test_crawler_udf_crawl(): + b = MockBackend( + rows={ + "SHOW DATABASES": [ + make_row(("database_one",), ["databaseName"]), + ], + "SHOW USER FUNCTIONS FROM hive_metastore.database_one": [ + make_row(("hive_metastore.database_one.function_one",), ["function"]), + make_row(("hive_metastore.database_one.function_two",), ["function"]), + ], + "DESCRIBE FUNCTION EXTENDED hive_metastore.database_one.*": [ + ("Type: SCALAR"), + ("Input: p INT"), + ("Returns: FLOAT"), + ("Deterministic: true"), + ("Data Access: CONTAINS SQL"), + ("Body: 1"), + ("ignore"), + ], + "SHOW GRANTS ON .*": [ + make_row(("princ1", "SELECT", "FUNCTION", "ignored"), SHOW_COLS), + ], + } + ) + + table = TablesCrawler(b, "schema") + udf = UdfsCrawler(b, "schema") + crawler = GrantsCrawler(table, udf) + grants = crawler.snapshot() + + assert len(grants) == 2 + assert Grant( + principal="princ1", + action_type="SELECT", + catalog="hive_metastore", + database="database_one", + table=None, + view=None, + udf="function_one", + any_file=False, + anonymous_function=False, + ) == next(g for g in grants if g.udf == "function_one") + assert Grant( + principal="princ1", + action_type="SELECT", + catalog="hive_metastore", + database="database_one", + table=None, + view=None, + udf="function_two", + any_file=False, + anonymous_function=False, + ) == next(g for g in grants if g.udf == "function_two") + + +def test_crawler_snapshot_when_no_data(): b = MockBackend() table = TablesCrawler(b, "schema") - crawler = GrantsCrawler(table) + udf = UdfsCrawler(b, "schema") + crawler = GrantsCrawler(table, udf) snapshot = crawler.snapshot() assert len(snapshot) == 0 - # Test with test data + + +def test_crawler_snapshot_with_data(): 
b = MockBackend(rows=ROWS) table = TablesCrawler(b, "schema") - crawler = GrantsCrawler(table) + udf = UdfsCrawler(b, "schema") + crawler = GrantsCrawler(table, udf) snapshot = crawler.snapshot() assert len(snapshot) == 3 -def test_grants_returning_error_when_describing(): +def test_grants_returning_error_when_showing_grants(): errors = {"SHOW GRANTS ON TABLE hive_metastore.test_database.table1": "error"} rows = { "SHOW DATABASES": [ @@ -224,8 +303,10 @@ def test_grants_returning_error_when_describing(): ], } - tc = TablesCrawler(MockBackend(fails_on_first=errors, rows=rows), "default") - crawler = GrantsCrawler(tc) + backend = MockBackend(fails_on_first=errors, rows=rows) + tc = TablesCrawler(backend, "default") + udf = UdfsCrawler(backend, "default") + crawler = GrantsCrawler(tc, udf) results = crawler._crawl() assert results == [ @@ -239,3 +320,112 @@ def test_grants_returning_error_when_describing(): anonymous_function=False, ) ] + + +def test_grants_returning_error_when_describing(): + errors = {"DESCRIBE TABLE EXTENDED hive_metastore.test_database.table1": "error"} + rows = { + "SHOW DATABASES": [ + make_row(("test_database",), ["databaseName"]), + ], + "SHOW TABLES FROM hive_metastore.test_database": [ + ("test_database", "table1", False), + ("test_database", "table2", False), + ], + "SHOW GRANTS ON TABLE hive_metastore.test_database.table2": [("principal1", "OWNER", "TABLE", "")], + "DESCRIBE *": [ + ("Catalog", "catalog", ""), + ("Type", "delta", ""), + ], + } + + backend = MockBackend(fails_on_first=errors, rows=rows) + tc = TablesCrawler(backend, "default") + udf = UdfsCrawler(backend, "default") + crawler = GrantsCrawler(tc, udf) + + results = crawler._crawl() + assert results == [ + Grant( + principal="principal1", + action_type="OWNER", + catalog="hive_metastore", + database="test_database", + table="table2", + any_file=False, + anonymous_function=False, + ) + ] + + +def test_udf_grants_returning_error_when_showing_grants(): + errors = {"SHOW GRANTS 
ON FUNCTION hive_metastore.test_database.function_bad": "error"} + rows = { + "SHOW DATABASES": [ + make_row(("test_database",), ["databaseName"]), + make_row(("other_database",), ["databaseName"]), + ], + "SHOW USER FUNCTIONS FROM hive_metastore.test_database": [ + make_row(("hive_metastore.test_database.function_bad",), ["function"]), + make_row(("hive_metastore.test_database.function_good",), ["function"]), + ], + "SHOW GRANTS ON FUNCTION hive_metastore.test_database.function_good": [("principal1", "OWN", "FUNCTION", "")], + "DESCRIBE *": [ + ("Type: SCALAR"), + ("Body: 1"), + ], + } + + backend = MockBackend(fails_on_first=errors, rows=rows) + tc = TablesCrawler(backend, "default") + udf = UdfsCrawler(backend, "default") + crawler = GrantsCrawler(tc, udf) + + results = crawler._crawl() + assert results == [ + Grant( + principal="principal1", + action_type="OWN", + catalog="hive_metastore", + database="test_database", + udf="function_good", + any_file=False, + anonymous_function=False, + ) + ] + + +def test_udf_grants_returning_error_when_describing(): + errors = {"DESCRIBE FUNCTION EXTENDED hive_metastore.test_database.function_bad": "error"} + rows = { + "SHOW DATABASES": [ + make_row(("test_database",), ["databaseName"]), + ], + "SHOW USER FUNCTIONS FROM hive_metastore.test_database": [ + make_row(("hive_metastore.test_database.function_bad",), ["function"]), + make_row(("hive_metastore.test_database.function_good",), ["function"]), + ], + "SHOW GRANTS ON FUNCTION hive_metastore.test_database.function_good": [("principal1", "OWN", "FUNCTION", "")], + "DESCRIBE *": [ + ("Type: SCALAR"), + ("Body: 1"), + ], + } + + backend = MockBackend(fails_on_first=errors, rows=rows) + tc = TablesCrawler(backend, "default") + udf = UdfsCrawler(backend, "default") + crawler = GrantsCrawler(tc, udf) + + results = crawler._crawl() + assert results == [ + Grant( + principal="principal1", + action_type="OWN", + catalog="hive_metastore", + database="test_database", + 
udf="function_good", + any_file=False, + anonymous_function=False, + ) + ] diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py new file mode 100644 index 0000000000..68043a13b1 --- /dev/null +++ b/tests/unit/hive_metastore/test_udfs.py @@ -0,0 +1,40 @@ +from databricks.labs.ucx.hive_metastore.udfs import Udf, UdfsCrawler + +from ..framework.mocks import MockBackend +from .test_grants import make_row + + +def test_key(): + udf = Udf( + catalog="CATALOG", + database="DB", + name="function", + func_type="_", + func_input="_", + func_returns="_", + deterministic=True, + data_access="", + body="", + ) + assert udf.key == "catalog.db.function" + + +def test_udfs_crawler_inventory_table(): + fc = UdfsCrawler(MockBackend(), "default") + assert fc._table == "udfs" + + +def test_udfs_returning_error_when_describing(): + errors = {"DESCRIBE FUNCTION EXTENDED hive_metastore.database.function1": "error"} + rows = { + "SHOW DATABASES": [ + make_row(("database",), ["databaseName"]), + ], + "SHOW USER FUNCTIONS FROM hive_metastore.database": [ + make_row(("hive_metastore.database.function1",), ["function"]), + ], + } + backend = MockBackend(fails_on_first=errors, rows=rows) + fc = UdfsCrawler(backend, "default") + results = fc._crawl() + assert len(results) == 0 diff --git a/tests/unit/workspace_access/test_manager.py b/tests/unit/workspace_access/test_manager.py index 0a24a1ed4e..024771ed1d 100644 --- a/tests/unit/workspace_access/test_manager.py +++ b/tests/unit/workspace_access/test_manager.py @@ -192,30 +192,34 @@ def test_factory(mocker): b = MockBackend() permission_manager = PermissionManager.factory(ws, b, "test") appliers = permission_manager._appliers() - assert { - "sql/warehouses", - "registered-models", - "instance-pools", - "jobs", - "directories", - "experiments", - "clusters", - "notebooks", - "repos", - "files", - "authorization", - "pipelines", - "cluster-policies", - "dashboards", - "queries", - "alerts", - "secrets", - 
"entitlements", - "roles", - "ANONYMOUS FUNCTION", - "CATALOG", - "TABLE", - "ANY FILE", - "VIEW", - "DATABASE", - } == appliers.keys() + + assert sorted( + { + "sql/warehouses", + "registered-models", + "instance-pools", + "jobs", + "directories", + "experiments", + "clusters", + "notebooks", + "repos", + "files", + "authorization", + "pipelines", + "cluster-policies", + "dashboards", + "queries", + "alerts", + "secrets", + "entitlements", + "roles", + "ANY FILE", + "FUNCTION", + "ANONYMOUS FUNCTION", + "CATALOG", + "TABLE", + "VIEW", + "DATABASE", + } + ) == sorted(appliers.keys()) diff --git a/tests/unit/workspace_access/test_tacl.py b/tests/unit/workspace_access/test_tacl.py index 8fc11222f3..f521096d0f 100644 --- a/tests/unit/workspace_access/test_tacl.py +++ b/tests/unit/workspace_access/test_tacl.py @@ -2,6 +2,7 @@ from databricks.labs.ucx.hive_metastore import GrantsCrawler, TablesCrawler from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.workspace_access.base import Permissions from databricks.labs.ucx.workspace_access.groups import MigratedGroup, MigrationState from databricks.labs.ucx.workspace_access.tacl import TableAclSupport @@ -13,12 +14,13 @@ def test_tacl_crawler(): sql_backend = MockBackend( rows={ "SELECT \\* FROM hive_metastore.test.grants": [ - ("foo@example.com", "SELECT", "catalog_a", "database_b", "table_c", None, False, False) + ("foo@example.com", "SELECT", "catalog_a", "database_b", "table_c", None, None, False, False) ] } ) tables_crawler = TablesCrawler(sql_backend, "test") - grants_crawler = GrantsCrawler(tables_crawler) + udf_crawler = UdfsCrawler(sql_backend, "test") + grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) crawler_tasks = table_acl_support.get_crawler_tasks() @@ -29,34 +31,58 @@ def test_tacl_crawler(): assert "catalog_a.database_b.table_c" == 
x.object_id +def test_tacl_udf_crawler(): + sql_backend = MockBackend( + rows={ + "SELECT \\* FROM hive_metastore.test.grants": [ + ("foo@example.com", "READ_METADATA", "catalog_a", "database_b", None, None, "function_c", False, False) + ] + } + ) + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") + grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) + table_acl_support = TableAclSupport(grants_crawler, sql_backend) + + crawler_tasks = table_acl_support.get_crawler_tasks() + first_task = next(crawler_tasks) + x = first_task() + + assert "FUNCTION" == x.object_type + assert "catalog_a.database_b.function_c" == x.object_id + + def test_tacl_crawler_multiple_permissions(): sql_backend = MockBackend( rows={ "SELECT \\* FROM hive_metastore.test.grants": [ - ("foo@example.com", "SELECT", "catalog_a", "database_b", "table_c", None, False, False), - ("foo@example.com", "MODIFY", "catalog_a", "database_b", "table_c", None, False, False), - ("foo@example.com", "OWN", "catalog_a", "database_b", "table_c", None, False, False), + ("foo@example.com", "SELECT", "catalog_a", "database_b", "table_c", None, None, False, False), + ("foo@example.com", "MODIFY", "catalog_a", "database_b", "table_c", None, None, False, False), + ("foo@example.com", "OWN", "catalog_a", "database_b", "table_c", None, None, False, False), # different table name (object_id) - ("foo@example.com", "SELECT", "catalog_a", "database_b", "table_d", None, False, False), + ("foo@example.com", "SELECT", "catalog_a", "database_b", "table_d", None, None, False, False), # different principal - ("foo2@example.com", "SELECT", "catalog_a", "database_b", "table_c", None, False, False), + ("foo2@example.com", "SELECT", "catalog_a", "database_b", "table_c", None, None, False, False), # duplicate - ("foo2@example.com", "SELECT", "catalog_a", "database_b", "table_c", None, False, False), + ("foo2@example.com", "SELECT", "catalog_a", "database_b", "table_c", None, 
None, False, False), # view - ("foo3@example.com", "SELECT", "catalog_a", "database_b", None, "view_c", False, False), + ("foo3@example.com", "SELECT", "catalog_a", "database_b", None, "view_c", None, False, False), # database - ("foo3@example.com", "SELECT", "catalog_a", "database_b", None, None, False, False), + ("foo3@example.com", "SELECT", "catalog_a", "database_b", None, None, None, False, False), # catalog - ("foo3@example.com", "SELECT", "catalog_a", None, None, None, False, False), + ("foo3@example.com", "SELECT", "catalog_a", None, None, None, None, False, False), # any file - ("foo3@example.com", "SELECT", None, None, None, None, True, False), - # function - ("foo3@example.com", "SELECT", None, None, None, None, False, True), + ("foo3@example.com", "SELECT", None, None, None, None, None, True, False), + # anonymous function + ("foo3@example.com", "SELECT", None, None, None, None, None, False, True), + # udf (user defined function) + ("foo3@example.com", "SELECT", "catalog_a", "database_b", None, None, "function_c", False, False), ] } ) tables_crawler = TablesCrawler(sql_backend, "test") - grants_crawler = GrantsCrawler(tables_crawler) + udf_crawler = UdfsCrawler(sql_backend, "test") + grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) crawler_tasks = table_acl_support.get_crawler_tasks() @@ -72,6 +98,7 @@ def test_tacl_crawler_multiple_permissions(): database="database_b", table="table_c", view=None, + udf=None, any_file=False, anonymous_function=False, ) == Grant(**json.loads(permissions.raw)) @@ -87,6 +114,7 @@ def test_tacl_crawler_multiple_permissions(): database="database_b", table="table_d", view=None, + udf=None, any_file=False, anonymous_function=False, ) == Grant(**json.loads(permissions.raw)) @@ -102,6 +130,7 @@ def test_tacl_crawler_multiple_permissions(): database="database_b", table="table_c", view=None, + udf=None, any_file=False, anonymous_function=False, ) == 
Grant(**json.loads(permissions.raw)) @@ -117,6 +146,7 @@ def test_tacl_crawler_multiple_permissions(): database="database_b", table=None, view="view_c", + udf=None, any_file=False, anonymous_function=False, ) == Grant(**json.loads(permissions.raw)) @@ -132,6 +162,7 @@ def test_tacl_crawler_multiple_permissions(): database="database_b", table=None, view=None, + udf=None, any_file=False, anonymous_function=False, ) == Grant(**json.loads(permissions.raw)) @@ -147,6 +178,7 @@ def test_tacl_crawler_multiple_permissions(): database=None, table=None, view=None, + udf=None, any_file=False, anonymous_function=False, ) == Grant(**json.loads(permissions.raw)) @@ -162,6 +194,7 @@ def test_tacl_crawler_multiple_permissions(): database=None, table=None, view=None, + udf=None, any_file=True, anonymous_function=False, ) == Grant(**json.loads(permissions.raw)) @@ -177,10 +210,27 @@ def test_tacl_crawler_multiple_permissions(): database=None, table=None, view=None, + udf=None, any_file=False, anonymous_function=True, ) == Grant(**json.loads(permissions.raw)) + permissions = next(crawler_tasks)() + + assert "FUNCTION" == permissions.object_type + assert "catalog_a.database_b.function_c" == permissions.object_id + assert Grant( + principal="foo3@example.com", + action_type="SELECT", + catalog="catalog_a", + database="database_b", + table=None, + view=None, + udf="function_c", + any_file=False, + anonymous_function=False, + ) == Grant(**json.loads(permissions.raw)) + def test_tacl_applier(mocker): sql_backend = MockBackend() @@ -218,6 +268,42 @@ def test_tacl_applier(mocker): assert ["GRANT SELECT ON TABLE catalog_a.database_b.table_c TO `account-abc`"] == sql_backend.queries +def test_tacl_udf_applier(mocker): + sql_backend = MockBackend() + table_acl_support = TableAclSupport(mocker.Mock(), sql_backend) + + permissions = Permissions( + object_type="FUNCTION", + object_id="catalog_a.database_b.function_c", + raw=json.dumps( + { + "principal": "abc", + "action_type": "SELECT", + 
"catalog": "catalog_a", + "database": "database_b", + "udf": "function_c", + } + ), + ) + grp = [ + MigratedGroup( + id_in_workspace=None, + name_in_workspace="abc", + name_in_account="account-abc", + temporary_name="tmp-backup-abc", + members=None, + entitlements=None, + external_id=None, + roles=None, + ) + ] + migration_state = MigrationState(grp) + task = table_acl_support.get_apply_task(permissions, migration_state) + task() + + assert ["GRANT SELECT ON FUNCTION catalog_a.database_b.function_c TO `account-abc`"] == sql_backend.queries + + def test_tacl_applier_multiple_actions(mocker): sql_backend = MockBackend() table_acl_support = TableAclSupport(mocker.Mock(), sql_backend)