-
Notifications
You must be signed in to change notification settings - Fork 88
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add table in mount migration #1225
Changes from 24 commits
b5675f2
ce9d24b
0c12eaa
a1f35ba
362c547
9e6099c
64f74af
297fde6
e0a2953
32878be
0e70761
bc792c0
74fff05
e420e5f
c2b3714
a5fa67a
bd9b6e0
bfe0983
4aec2b1
d11467e
627c9ed
93428d2
d685971
e7422c6
f955b69
7e9615b
4a233e9
595181f
9ba50a4
a888162
c2026e1
b15c558
d1e7503
9669208
0d931c3
c47ad0e
a139a83
115db64
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -279,6 +279,7 @@ def __init__( | |
self._include_mounts = include_mounts | ||
self._ws = ws | ||
self._include_paths_in_mount = include_paths_in_mount | ||
self._seen_tables: dict[str, str] = {} | ||
william-conti marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
irrelevant_patterns = {'_SUCCESS', '_committed_', '_started_'} | ||
if exclude_paths_in_mount: | ||
|
@@ -293,6 +294,7 @@ def _snapshot(self, fetcher: ResultFn, loader: ResultFn) -> list[Result]: | |
cached_results = [] | ||
try: | ||
cached_results = list(fetcher()) | ||
self._init_seen_tables(cached_results) | ||
william-conti marked this conversation as resolved.
Show resolved
Hide resolved
|
||
except NotFound: | ||
pass | ||
logger.debug(f"[{self.full_name}] crawling new batch for {self._table}") | ||
|
@@ -302,6 +304,12 @@ def _snapshot(self, fetcher: ResultFn, loader: ResultFn) -> list[Result]: | |
self._append_records(loaded_records) | ||
return loaded_records | ||
|
||
def _init_seen_tables(self, loaded_records: Iterable[Table]):
    """Seed the location -> table-key map from previously cached crawl results.

    Records without a location are skipped, so only locatable tables can
    later be recognised as already-registered during the mount crawl.
    """
    # Single dict-comprehension update preserves the original per-record
    # assignment semantics, including "last record wins" on duplicate locations.
    self._seen_tables.update({rec.location: rec.key for rec in loaded_records if rec.location})
william-conti marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def _append_records(self, items: Sequence[Table]): | ||
logger.debug(f"[{self.full_name}] found {len(items)} new records for {self._table}") | ||
self._backend.save_table(self.full_name, items, Table, mode="overwrite") | ||
|
@@ -329,18 +337,41 @@ def _crawl(self): | |
|
||
for path, entry in table_paths.items(): | ||
guess_table = os.path.basename(path) | ||
table_location = self._get_table_location(mount, path) | ||
if table_location in self._seen_tables: | ||
logger.info( | ||
f"Path {table_location} is identified as a table in mount, but is present in current workspace as a registered table {self._seen_tables[table_location]}" | ||
) | ||
continue | ||
if path in self._seen_tables: | ||
logger.info( | ||
f"Path {table_location} is identified as a table in mount, but is present in current workspace as a registered table {self._seen_tables[path]}" | ||
william-conti marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
continue | ||
table = Table( | ||
catalog="hive_metastore", | ||
database=f"{self.TABLE_IN_MOUNT_DB}{mount.name.replace('/mnt/', '').replace('/', '_')}", | ||
name=guess_table, | ||
object_type="EXTERNAL", | ||
table_format=entry.format, | ||
location=path.replace(f"dbfs:{mount.name}/", mount.source), | ||
location=table_location, | ||
is_partitioned=entry.is_partitioned, | ||
) | ||
all_tables.append(table) | ||
logger.info(f"Found a total of {len(all_tables)} tables in mount points") | ||
return all_tables | ||
|
||
def _get_table_location(self, mount: Mount, path: str):
    """Translate a ``dbfs:/mnt/...`` path into the mount's underlying storage URI.

    There can be different cases for mounts:
    - Mount(name='/mnt/things/a', source='abfss://things@labsazurethings.dfs.core.windows.net/a')
    - Mount(name='/mnt/mount', source='abfss://container@dsss.net/')
    Both must return the complete source with a forward slash in the end.
    """
    prefix = f"dbfs:{mount.name}"
    # When the source already ends with "/", strip the separator from the dbfs
    # prefix too, so exactly one slash joins source and the remaining path.
    if not mount.source.endswith("/"):
        return path.replace(prefix, mount.source)
    return path.replace(prefix + "/", mount.source)
|
||
def _find_delta_log_folders(self, root_dir: str, delta_log_folders=None) -> dict: | ||
if delta_log_folders is None: | ||
delta_log_folders = {} | ||
|
@@ -371,7 +402,8 @@ def _find_delta_log_folders(self, root_dir: str, delta_log_folders=None) -> dict | |
def _assess_path( | ||
self, file_info: FileInfo, delta_log_folders: dict[str, Table], root_path: str | ||
) -> TableInMount | None: | ||
if file_info.name == "_delta_log/": | ||
# Depending on the execution runtime: with the SDK, dbutils.fs.list returns _delta_log, while a cluster returns _delta_log/ | ||
if file_info.name in {"_delta_log/", "_delta_log"}: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not for this line, but for lines below: if self._is_partitioned(file_info.name):
logger.debug(f"Found partitioned parquet {file_info.path}")
return TableInMount(format="PARQUET", is_partitioned=True) Isn't JSON and CSV tables could be partitioned too? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could be, but I'll like to remain those out of scope for the moment and handle those in a separate PR There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But the issue for this PR seems to be: if it sees There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct, in this case it's a bug and the TablesInMounts logic has to be pretty much rewritten to handle those small cases. Currently the logic handles partitioned Delta and Parquet, and by default partitioned CSVs and JSONs would be identified as partitioned. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
logger.debug(f"Found delta table {root_path}") | ||
if not delta_log_folders.get(root_path): | ||
return TableInMount(format="DELTA", is_partitioned=False) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,13 +27,25 @@ def migrate_dbfs_root_delta_tables(self, ctx: RuntimeContext): | |
ctx.tables_migrator.migrate_tables(what=What.DBFS_ROOT_DELTA, acl_strategy=[AclMigrationWhat.LEGACY_TACL]) | ||
|
||
|
||
class ScanTablesInMounts(Workflow):
    """Experimental workflow that discovers tables stored inside mount points."""

    def __init__(self):
        super().__init__('scan-tables-in-mounts-experimental')

    @job_task
    def scan_tables_in_mounts_experimental(self, ctx: RuntimeContext):
        """[EXPERIMENTAL] This workflow scans for Delta tables inside all mount points
        captured during the assessment. It will store the results under the `tables` table
        located under the assessment."""
        ctx.tables_in_mounts.snapshot()
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please integration test TablesInMounts as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's actually very hard to integration test TablesInMounts because
This code is made to handle the behaviour of a Spark cluster as much as possible. Additionally, some utility methods are missing in the SDK, like isDir(). @nfx FYI |
||
|
||
|
||
class MigrateTablesInMounts(Workflow):
    """Experimental workflow that migrates previously scanned tables in mounts."""

    def __init__(self):
        super().__init__('migrate-tables-in-mounts-experimental')

    @job_task(
        job_cluster="table_migration",
        depends_on=[ScanTablesInMounts.scan_tables_in_mounts_experimental],
    )
    def migrate_tables_in_mounts_experimental(self, ctx: RuntimeContext):
        """[EXPERIMENTAL] This workflow migrates `delta tables stored in mount points` to Unity Catalog using a Create Table statement."""
        # Scanning must have populated the inventory first (see depends_on above).
        acl_strategy = [AclMigrationWhat.DEFAULT_TABLE_OWNER]
        ctx.tables_migrator.migrate_tables(what=What.TABLE_IN_MOUNT, acl_strategy=acl_strategy)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`migrate_tables_in_mounts_experimental` also depends on the `create-table-mapping` command, and `create-table-mapping` needs `scan_tables_in_mounts_experimental` so it can map the tables in mounts. The correct order is:
1. assessment
2. scan_tables_in_mounts_experimental
3. create-table-mapping cli command
4. migrate_tables_in_mounts_experimental
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After testing it somehow needs to have azure_storage_account_info.csv to work ...