diff --git a/.github/codecov.yml b/.github/codecov.yml deleted file mode 100644 index aa8cf6b8d7..0000000000 --- a/.github/codecov.yml +++ /dev/null @@ -1,10 +0,0 @@ -coverage: - status: - project: - default: - target: auto - threshold: 0.5% - patch: - default: - target: auto - threshold: 0.5% \ No newline at end of file diff --git a/.github/workflows/acceptance.yml b/.github/workflows/acceptance.yml index 407cdb2d6c..ffb711c2f1 100644 --- a/.github/workflows/acceptance.yml +++ b/.github/workflows/acceptance.yml @@ -40,8 +40,7 @@ jobs: run: pip install hatch==1.9.4 - name: Run integration tests - # ... - uses: databrickslabs/sandbox/acceptance@acceptance/v0.2.2 + uses: databrickslabs/sandbox/acceptance@acceptance/v0.2.1 with: vault_uri: ${{ secrets.VAULT_URI }} timeout: 45m diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 8b4b290197..78e307535e 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -35,7 +35,7 @@ jobs: run: pip install hatch==1.9.4 - name: Run nightly tests - uses: databrickslabs/sandbox/acceptance@acceptance/v0.2.2 + uses: databrickslabs/sandbox/acceptance@acceptance/v0.2.1 with: vault_uri: ${{ secrets.VAULT_URI }} timeout: 45m diff --git a/README.md b/README.md index d90b6f216d..1112b850b5 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [Table Migration Workflow](#table-migration-workflow) * [Dependency CLI commands](#dependency-cli-commands) * [Table Migration Workflow Tasks](#table-migration-workflow-tasks) - * [Post Migration Data Reconciliation Task](#post-migration-data-reconciliation-task) * [Other considerations](#other-considerations) - * [Jobs Static Code Analysis Workflow](#jobs-static-code-analysis-workflow) * [Utility commands](#utility-commands) * [`logs` command](#logs-command) * [`ensure-assessment-run` command](#ensure-assessment-run-command) @@ -68,7 +66,6 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. * [`move` command](#move-command) * [`alias` command](#alias-command) * [Code migration commands](#code-migration-commands) - * [`lint-local-code` command](#lint-local-code-command) * [`migrate-local-code` command](#migrate-local-code-command) * [`migrate-dbsql-dashboards` command](#migrate-dbsql-dashboards-command) * [`revert-dbsql-dashboards` command](#revert-dbsql-dashboards-command) @@ -87,7 +84,6 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project. - Databricks CLI v0.213 or later. See [instructions](#authenticate-databricks-cli). - Python 3.10 or later. See [Windows](https://www.python.org/downloads/windows/) instructions. -- Databricks Premium or Enterprise workspace. - Network access to your Databricks Workspace used for the [installation process](#install-ucx). - Network access to the Internet for [pypi.org](https://pypi.org) and [github.com](https://github.com) from machine running the installation. - Databricks Workspace Administrator privileges for the user, that runs the installation. Running UCX as a Service Principal is not supported. @@ -462,21 +458,6 @@ There are 3 main table migration workflows, targeting different table types. All - Migrate tables using CTAS - Experimentally migrate Delta and Parquet data found in dbfs mount but not registered as Hive Metastore table into UC tables. 
-### Post Migration Data Reconciliation Task -UCX also provides `migrate-data-reconciliation` workflow to validate the integrity of the migrated tables: -- Compare the schema of the source and target tables. The result is `schema_matches`, and column by column comparison -is stored as `column_comparison` struct. -- Compare the row counts of the source and target tables. If the row count is within the reconciliation threshold -(defaults to 5%), `data_matches` is True. -- Compare the content of individual row between source and target tables to identify any discrepancies (when `compare_rows` -flag is enabled). This is done using hash comparison, and number of missing rows are stored as `source_missing_count` -and `target_missing_count` - -Once the workflow completes, the output will be stored in `$inventory_database.reconciliation_results` view, and displayed -in the Migration dashboard. - -![reconciliation results](docs/recon_results.png) - ### Other considerations - You may need to run the workflow multiple times to ensure all the tables are migrated successfully in phases. - If your Delta tables in DBFS root have a large number of files, consider: @@ -485,40 +466,6 @@ in the Migration dashboard. - Consider creating an instance pool, and setting its id when prompted during the UCX installation. This instance pool will be specified in the cluster policy used by all UCX workflows job clusters. - You may also manually edit the job cluster configration per job or per task after the workflows are deployed. -### [EXPERIMENTAL] Scan tables in mounts Workflow -#### Always run this workflow AFTER the assessment has finished -- This experimental workflow attemps to find all Tables inside mount points that are present on your workspace. -- If you do not run this workflow, then `migrate-tables-in-mounts-experimental` won't do anything. -- It writes all results to `hive_metastore..tables`, you can query those tables found by filtering on database values that starts with `mounted_` -- This command is incremental, meaning that each time you run it, it will overwrite the previous tables in mounts found. -- Current format are supported: - - DELTA - PARQUET - CSV - JSON - - Also detects partitioned DELTA and PARQUET -- You can configure these workflows with the following options available on conf.yml: - - include_mounts : A list of mount points to scans, by default the workflow scans for all mount points - - exclude_paths_in_mount : A list of paths to exclude in all mount points - - include_paths_in_mount : A list of paths to include in all mount points - -### [EXPERIMENTAL] Migrate tables in mounts Workflow -- An experimental workflow that migrates tables in mount points using a `CREATE TABLE` command, optinally sets a default tables owner if provided in `default_table_owner` conf parameter. -- You must do the following in order to make this work: - - run the Assessment [workflow](#assessment-workflow) - - run the scan tables in mounts [workflow](#EXPERIMENTAL-scan-tables-in-mounts-workflow) - - run the [`create-table-mapping` command](#create-table-mapping-command) - - or manually create a `mapping.csv` file in Workspace -> Applications -> ucx - - -[[back to top](#databricks-labs-ucx)] - -## Jobs Static Code Analysis Workflow - -> Please note that this is an experimental workflow. - -The `experimental-workflow-linter` workflow lints accessible code belonging to all workflows/jobs present in the -workspace. The linting emits problems indicating what to resolve for making the code Unity Catalog compatible. 
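The reconciliation section removed above describes a row-count check with a default 5% threshold that drives `data_matches`. As an illustration only — a hypothetical helper, not the project's `MigrationRecon` implementation — the comparison that text describes amounts to:

```python
# Sketch of the row-count tolerance check described in the removed README section.
# Hypothetical helper for illustration; not the actual MigrationRecon code.
def row_counts_match(source_count: int, target_count: int, tolerance_percent: int = 5) -> bool:
    """True when the relative difference in row counts stays within the threshold."""
    if source_count == 0:
        return target_count == 0
    drift = abs(source_count - target_count) / source_count * 100
    return drift <= tolerance_percent
```

When the counts agree within the threshold, the removed workflow reports `data_matches` as True; the row-by-row hash comparison is a separate, optional step gated by `compare_rows`.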
- -![code compatibility problems](docs/code_compatibility_problems.png) - [[back to top](#databricks-labs-ucx)] # Utility commands @@ -974,25 +921,6 @@ clusters to be UC compatible. [[back to top](#databricks-labs-ucx)] -## `lint-local-code` command - -```text -databricks labs ucx lint-local-code -``` - -At any time, you can run this command to assess all migrations required in a local directory or a file. It only takes seconds to run and it -gives you an initial overview of what needs to be migrated without actually performing any migration. A great way to start a migration! - -This command detects all dependencies, and analyzes them. It is still experimental and at the moment only supports Python and SQL files. -We expect this command to run within a minute on code bases up to 50.000 lines of code. -Future versions of `ucx` will add support for more source types, and more migration details. - -When run from an IDE terminal, this command generates output as follows: -![img.png](docs/lint-local-code-output.png) -With modern IDEs, clicking on the file link opens the file at the problematic line - -[[back to top](#databricks-labs-ucx)] - ## `migrate-local-code` command ```text diff --git a/docs/code_compatibility_problems.png b/docs/code_compatibility_problems.png deleted file mode 100644 index 323c40199b..0000000000 Binary files a/docs/code_compatibility_problems.png and /dev/null differ diff --git a/docs/lint-local-code-output.png b/docs/lint-local-code-output.png deleted file mode 100644 index 387e16127a..0000000000 Binary files a/docs/lint-local-code-output.png and /dev/null differ diff --git a/docs/recon_results.png b/docs/recon_results.png deleted file mode 100644 index fc23b7c047..0000000000 Binary files a/docs/recon_results.png and /dev/null differ diff --git a/labs.yml b/labs.yml index 49c319849f..11617f30e3 100644 --- a/labs.yml +++ b/labs.yml @@ -123,19 +123,13 @@ commands: - name: create-missing-principals description: For AWS, this command identifies all the S3 locations that are missing a UC compatible role and - creates them. It accepts a number of optional parameters, i.e. KMS Key, Role Name, Policy Name, and whether to - create a single role for all the S3 locations. + creates them. It takes single-role optional parameter. + If set to True, it will create a single role for all the S3 locations. flags: - name: aws-profile description: AWS Profile to use for authentication - - name: kms-key - description: (Optional) KMS Key to be specified for the UC roles. - - name: role-name - description: (Optional) IAM Role name to be specified for the UC roles. (default:UC_ROLE) - - name: policy-name - description: (Optional) IAM policy Name to be specified for the UC roles. (default:UC_POLICY) - name: single-role - description: (Optional) Create a single role for all S3 locations. (default:False) + description: (Optional) Create a single role for all the S3 locations (default:True) - name: create-uber-principal description: For azure cloud, creates a service principal and gives STORAGE BLOB READER access on all the storage account @@ -192,9 +186,6 @@ commands: - name: migrate-local-code description: (Experimental) Migrate files in the current directory to be more compatible with Unity Catalog. - - name: lint-local-code - description: (Experimental) Lint files in the current directory to highlight incompatibilities with Unity Catalog. 
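The `labs.yml` hunk above reduces `create-missing-principals` to a single `single-role` flag and flips its default to True. As a sketch under stated assumptions — the grouping and the numbered role names below are made up for illustration; the real work happens in `AWSResourcePermissions.list_uc_roles` with the default `UC_ROLE` name — the flag boils down to whether one role covers every S3 prefix or each prefix gets its own role:

```python
# Illustration of the single-role flag; hypothetical helper, not list_uc_roles itself.
def plan_uc_roles(s3_prefixes: set[str], *, single_role: bool = True) -> dict[str, list[str]]:
    """Map candidate IAM role names to the S3 prefixes each role would cover."""
    if single_role:
        return {"UC_ROLE": sorted(s3_prefixes)}
    return {f"UC_ROLE_{i}": [prefix] for i, prefix in enumerate(sorted(s3_prefixes), start=1)}

plan_uc_roles({"s3://bucket-a/data", "s3://bucket-b/data"}, single_role=False)
# -> {'UC_ROLE_1': ['s3://bucket-a/data'], 'UC_ROLE_2': ['s3://bucket-b/data']}
```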
- - name: show-all-metastores is_account_level: true description: Show all metastores available in the same region as the specified workspace diff --git a/pyproject.toml b/pyproject.toml index 0e2fbf2e01..3cbe04bf73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,11 +44,11 @@ classifiers = [ "Topic :: Utilities", ] -dependencies = ["databricks-sdk>=0.27,<0.29", +dependencies = ["databricks-sdk~=0.27.0", "databricks-labs-lsql~=0.4.0", "databricks-labs-blueprint>=0.6.0", "PyYAML>=6.0.0,<7.0.0", - "sqlglot>=23.9,<24.1"] + "sqlglot>=23.9,<23.16"] [project.entry-points.databricks] runtime = "databricks.labs.ucx.runtime:main" @@ -87,11 +87,11 @@ path = ".venv" test = "pytest -n 4 --cov src --cov-report=xml --timeout 30 tests/unit --durations 20" coverage = "pytest -n auto --cov src tests/unit --timeout 30 --cov-report=html --durations 20" integration = "pytest -n 10 --cov src tests/integration --durations 20" -fmt = ["black . --extend-exclude 'tests/unit/source_code/samples/*'", +fmt = ["black .", "ruff check . --fix", "mypy --disable-error-code 'annotation-unchecked' --exclude 'tests/unit/source_code/samples/*' .", "pylint --output-format=colorized -j 0 src tests"] -verify = ["black --check . --extend-exclude 'tests/unit/source_code/samples/*'", +verify = ["black --check .", "ruff .", "mypy --exclude 'tests/unit/source_code/samples/*' .", "pylint --output-format=colorized -j 0 src tests"] diff --git a/src/databricks/labs/ucx/assessment/aws.py b/src/databricks/labs/ucx/assessment/aws.py index d00303e1d2..c60831e73f 100644 --- a/src/databricks/labs/ucx/assessment/aws.py +++ b/src/databricks/labs/ucx/assessment/aws.py @@ -330,12 +330,7 @@ def update_uc_trust_role(self, role_name: str, external_id: str = "0000") -> str return update_role["Role"]["Arn"] def put_role_policy( - self, - role_name: str, - policy_name: str, - s3_prefixes: set[str], - account_id: str, - kms_key=None, + self, role_name: str, policy_name: str, s3_prefixes: set[str], account_id: str, kms_key=None ) -> bool: if not self._run_command( f"iam put-role-policy --role-name {role_name} --policy-name {policy_name} " diff --git a/src/databricks/labs/ucx/aws/access.py b/src/databricks/labs/ucx/aws/access.py index 597b1d1128..f24de2c376 100644 --- a/src/databricks/labs/ucx/aws/access.py +++ b/src/databricks/labs/ucx/aws/access.py @@ -33,13 +33,14 @@ def __init__( ws: WorkspaceClient, aws_resources: AWSResources, external_locations: ExternalLocations, + aws_account_id=None, kms_key=None, ): self._installation = installation self._aws_resources = aws_resources self._ws = ws self._locations = external_locations - self._aws_account_id = aws_resources.validate_connection().get("Account") + self._aws_account_id = aws_account_id self._kms_key = kms_key def list_uc_roles(self, *, single_role=True, role_name="UC_ROLE", policy_name="UC_POLICY"): diff --git a/src/databricks/labs/ucx/aws/credentials.py b/src/databricks/labs/ucx/aws/credentials.py index 34d77c60eb..c11bd21987 100644 --- a/src/databricks/labs/ucx/aws/credentials.py +++ b/src/databricks/labs/ucx/aws/credentials.py @@ -205,12 +205,10 @@ def _print_action_plan(iam_list: list[AWSUCRoleCandidate]): def save(self, migration_results: list[CredentialValidationResult]) -> str: return self._installation.save(migration_results, filename=self._output_file) - def run(self, prompts: Prompts, *, single_role=False, role_name="UC_ROLE", policy_name="UC_POLICY"): + def run(self, prompts: Prompts, *, single_role=True, role_name="UC_ROLE", policy_name="UC_POLICY"): iam_list = 
self._resource_permissions.list_uc_roles( - single_role=single_role, - role_name=role_name, - policy_name=policy_name, + single_role=single_role, role_name=role_name, policy_name=policy_name ) if not iam_list: logger.info("No IAM Role created") diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 7f350315a8..8cbdca8cc0 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -11,7 +11,7 @@ from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.contexts.account_cli import AccountContext -from databricks.labs.ucx.contexts.workspace_cli import WorkspaceContext, LocalCheckoutContext +from databricks.labs.ucx.contexts.workspace_cli import WorkspaceContext from databricks.labs.ucx.hive_metastore.tables import What ucx = App(__file__) @@ -293,19 +293,17 @@ def create_missing_principals( w: WorkspaceClient, prompts: Prompts, ctx: WorkspaceContext | None = None, - single_role: bool = False, - role_name="UC_ROLE", - policy_name="UC_POLICY", + single_role: bool = True, **named_parameters, ): """Not supported for Azure. For AWS, this command identifies all the S3 locations that are missing a UC compatible role and creates them. - By default, it will create a role per S3 location. Set the optional single_role parameter to True to create a single role for all S3 locations. + By default, it will create a single role for all S3. Set the optional single_role parameter to False, to create one role per S3 location. """ if not ctx: ctx = WorkspaceContext(w, named_parameters) if ctx.is_aws: - return ctx.iam_role_creation.run(prompts, single_role=single_role, role_name=role_name, policy_name=policy_name) + return ctx.iam_role_creation.run(prompts, single_role=single_role) raise ValueError("Unsupported cloud provider") @@ -398,7 +396,7 @@ def revert_cluster_remap(w: WorkspaceClient, prompts: Prompts): @ucx.command def migrate_local_code(w: WorkspaceClient, prompts: Prompts): """Fix the code files based on their language.""" - ctx = LocalCheckoutContext(w) + ctx = WorkspaceContext(w) working_directory = Path.cwd() if not prompts.confirm("Do you want to apply UC migration to all files in the current directory?"): return @@ -471,15 +469,5 @@ def revert_dbsql_dashboards(w: WorkspaceClient, dashboard_id: str | None = None) ctx.redash.revert_dashboards(dashboard_id) -@ucx.command -def lint_local_code( - w: WorkspaceClient, prompts: Prompts, path: str | None = None, ctx: LocalCheckoutContext | None = None -): - """Lint local code files looking for problems.""" - if ctx is None: - ctx = LocalCheckoutContext(w) - ctx.local_code_linter.lint(prompts, None if path is None else Path(path)) - - if __name__ == "__main__": ucx() diff --git a/src/databricks/labs/ucx/config.py b/src/databricks/labs/ucx/config.py index 95da81fbf9..9dee59b7aa 100644 --- a/src/databricks/labs/ucx/config.py +++ b/src/databricks/labs/ucx/config.py @@ -58,8 +58,8 @@ class WorkspaceConfig: # pylint: disable=too-many-instance-attributes # List of workspace ids ucx is installed on, only applied to account-level installation installed_workspace_ids: list[int] | None = None - # Threshold for row count comparison during data reconciliation, in percentage - recon_tolerance_percent: int = 5 + # Whether to upload dependent libraries to the workspace + upload_dependencies: bool = False # Whether to upload dependent libraries to the workspace upload_dependencies: bool = False diff --git a/src/databricks/labs/ucx/contexts/application.py 
b/src/databricks/labs/ucx/contexts/application.py index 3d9898b64b..4f98c18957 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -9,15 +9,8 @@ from databricks.labs.blueprint.tui import Prompts from databricks.labs.blueprint.wheels import ProductInfo, WheelsV2 from databricks.labs.lsql.backends import SqlBackend - -from databricks.labs.ucx.recon.data_comparator import StandardDataComparator -from databricks.labs.ucx.recon.data_profiler import StandardDataProfiler -from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever -from databricks.labs.ucx.recon.migration_recon import MigrationRecon -from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator from databricks.labs.ucx.source_code.python_libraries import PipResolver from databricks.sdk import AccountClient, WorkspaceClient, core -from databricks.sdk.errors import ResourceDoesNotExist from databricks.sdk.service import sql from databricks.labs.ucx.account.workspaces import WorkspaceInfo @@ -47,10 +40,10 @@ NotebookResolver, NotebookLoader, ) -from databricks.labs.ucx.source_code.files import FileLoader, FolderLoader, ImportFileResolver +from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.graph import DependencyResolver -from databricks.labs.ucx.source_code.whitelist import Whitelist +from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist from databricks.labs.ucx.source_code.languages import Languages from databricks.labs.ucx.source_code.redash import Redash from databricks.labs.ucx.workspace_access import generic, redash @@ -287,15 +280,12 @@ def aws_acl(self): @cached_property def principal_locations(self): eligible_locations = {} - try: - if self.is_azure: - eligible_locations = self.azure_acl.get_eligible_locations_principals() - if self.is_aws: - eligible_locations = self.aws_acl.get_eligible_locations_principals() - if self.is_gcp: - raise NotImplementedError("Not implemented for GCP.") - except ResourceDoesNotExist: - pass + if self.is_azure: + eligible_locations = self.azure_acl.get_eligible_locations_principals() + if self.is_aws: + eligible_locations = self.aws_acl.get_eligible_locations_principals() + if self.is_gcp: + raise NotImplementedError("Not implemented for GCP.") return eligible_locations @cached_property @@ -386,23 +376,24 @@ def path_lookup(self): def file_loader(self): return FileLoader() - @cached_property - def folder_loader(self): - return FolderLoader(self.file_loader) - @cached_property def whitelist(self): # TODO: fill in the whitelist return Whitelist() + @cached_property + def whitelist_resolver(self): + return WhitelistResolver(self.whitelist) + @cached_property def file_resolver(self): - return ImportFileResolver(self.file_loader, self.whitelist) + return LocalFileResolver(self.file_loader) @cached_property def dependency_resolver(self): + import_resolvers = [self.file_resolver, self.whitelist_resolver] library_resolvers = [self.pip_resolver] - return DependencyResolver(library_resolvers, self.notebook_resolver, self.file_resolver, self.path_lookup) + return DependencyResolver(library_resolvers, self.notebook_resolver, import_resolvers, self.path_lookup) @cached_property def workflow_linter(self): @@ -421,34 +412,6 @@ def redash(self): self.installation, ) - @cached_property - def metadata_retriever(self): - return 
DatabricksTableMetadataRetriever(self.sql_backend) - - @cached_property - def schema_comparator(self): - return StandardSchemaComparator(self.metadata_retriever) - - @cached_property - def data_profiler(self): - return StandardDataProfiler(self.sql_backend, self.metadata_retriever) - - @cached_property - def data_comparator(self): - return StandardDataComparator(self.sql_backend, self.data_profiler) - - @cached_property - def migration_recon(self): - return MigrationRecon( - self.sql_backend, - self.inventory_database, - self.migration_status_refresher, - self.table_mapping, - self.schema_comparator, - self.data_comparator, - self.config.recon_tolerance_percent, - ) - class CliContext(GlobalContext, abc.ABC): @cached_property diff --git a/src/databricks/labs/ucx/contexts/workspace_cli.py b/src/databricks/labs/ucx/contexts/workspace_cli.py index 709e3a00d6..b6a7a207e9 100644 --- a/src/databricks/labs/ucx/contexts/workspace_cli.py +++ b/src/databricks/labs/ucx/contexts/workspace_cli.py @@ -8,14 +8,13 @@ from databricks.labs.ucx.assessment.aws import run_command, AWSResources from databricks.labs.ucx.aws.access import AWSResourcePermissions from databricks.labs.ucx.aws.credentials import IamRoleMigration, IamRoleCreation -from databricks.labs.ucx.aws.locations import AWSExternalLocationsMigration from databricks.labs.ucx.azure.access import AzureResourcePermissions from databricks.labs.ucx.azure.credentials import StorageCredentialManager, ServicePrincipalMigration from databricks.labs.ucx.azure.locations import ExternalLocationsMigration +from databricks.labs.ucx.aws.locations import AWSExternalLocationsMigration from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources from databricks.labs.ucx.contexts.application import CliContext -from databricks.labs.ucx.source_code.files import LocalFileMigrator, LocalCodeLinter -from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader +from databricks.labs.ucx.source_code.files import LocalFileMigrator from databricks.labs.ucx.workspace_access.clusters import ClusterAccess @@ -32,6 +31,10 @@ def workspace_client(self) -> WorkspaceClient: def sql_backend(self) -> SqlBackend: return StatementExecutionBackend(self.workspace_client, self.config.warehouse_id) + @cached_property + def local_file_migrator(self): + return LocalFileMigrator(self.languages) + @cached_property def cluster_access(self): return ClusterAccess(self.installation, self.workspace_client, self.prompts) @@ -148,6 +151,7 @@ def aws_resource_permissions(self): self.workspace_client, self.aws_resources, self.external_locations, + self.named_parameters.get("aws_account_id"), self.named_parameters.get("kms_key"), ) @@ -166,22 +170,3 @@ def iam_role_creation(self): self.workspace_client, self.aws_resource_permissions, ) - - @cached_property - def notebook_loader(self) -> NotebookLoader: - return NotebookLoader() - - -class LocalCheckoutContext(WorkspaceContext): - """Local context extends Workspace context to provide extra properties - for running local operations.""" - - @cached_property - def local_file_migrator(self): - return LocalFileMigrator(self.languages) - - @cached_property - def local_code_linter(self): - return LocalCodeLinter( - self.file_loader, self.folder_loader, self.path_lookup, self.dependency_resolver, lambda: self.languages - ) diff --git a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py index 27bc0abd84..f3ecba4338 100644 --- 
a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py +++ b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py @@ -1,7 +1,6 @@ import collections import logging from dataclasses import replace -import fnmatch from pathlib import PurePath from databricks.labs.blueprint.tui import Prompts @@ -138,14 +137,14 @@ def _validate_location(self, location: str): if location == "metastore": return True try: - PurePath(location) + location_path = PurePath(location) except ValueError: logger.error(f"Invalid location path {location}") return False for external_location in self._external_locations: if location == external_location.url: return True - if external_location.url is not None and fnmatch.fnmatch(location, external_location.url + '*'): + if location_path.match(f"{external_location.url}/*"): return True return False diff --git a/src/databricks/labs/ucx/hive_metastore/locations.py b/src/databricks/labs/ucx/hive_metastore/locations.py index 3e59f96fe0..8e340dcce5 100644 --- a/src/databricks/labs/ucx/hive_metastore/locations.py +++ b/src/databricks/labs/ucx/hive_metastore/locations.py @@ -137,30 +137,24 @@ def _try_fetch(self) -> Iterable[ExternalLocation]: ): yield ExternalLocation(*row) - @staticmethod - def _get_ext_location_definitions(missing_locations: list[ExternalLocation]) -> list: + def _get_ext_location_definitions(self, missing_locations: list[ExternalLocation]) -> list: tf_script = [] cnt = 1 - res_name = "" - supported_prefixes = ("s3://", "s3a://", "s3n://", "gcs://", "abfss://") for loc in missing_locations: - for prefix in supported_prefixes: - prefix_len = len(prefix) - if not loc.location.startswith(prefix): - continue - if prefix == "abfss://": - container_sep_loc = loc.location.index("@") - container_name = loc.location[prefix_len:container_sep_loc] - res_name = ( - loc.location[container_sep_loc + 1 :] - .replace(".dfs.core.windows.net", "") - .rstrip("/") - .replace("/", "_") - ) - res_name = f"{container_name}_{res_name}" - else: - res_name = loc.location[prefix_len:].rstrip("/").replace("/", "_") - if res_name == "": + if loc.location.startswith("s3://"): + res_name = loc.location[5:].rstrip("/").replace("/", "_") + elif loc.location.startswith("gcs://"): + res_name = loc.location[6:].rstrip("/").replace("/", "_") + elif loc.location.startswith("abfss://"): + container_name = loc.location[8 : loc.location.index("@")] + res_name = ( + loc.location[loc.location.index("@") + 1 :] + .replace(".dfs.core.windows.net", "") + .rstrip("/") + .replace("/", "_") + ) + res_name = f"{container_name}_{res_name}" + else: # if the cloud storage url doesn't match the above condition or incorrect (example wasb://) # dont generate tf script and ignore logger.warning(f"unsupported storage format {loc.location}") diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index a6e8c89de6..5117b1149e 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -15,7 +15,6 @@ from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore import TablesCrawler from databricks.labs.ucx.hive_metastore.tables import Table -from databricks.labs.ucx.recon.base import TableIdentifier logger = logging.getLogger(__name__) @@ -28,11 +27,9 @@ class Rule: dst_schema: str src_table: str dst_table: str - recon_tolerance_percent: int = 0 # threshold for row count comparison - compare_rows: bool = False # whether to compare row by row 
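The `catalog_schema.py` hunk above swaps an `fnmatch`-based prefix check back to `PurePath.match` when validating a location against registered external locations. The two behave differently on nested paths, which is the practical difference to keep in mind when reading that change. A standalone illustration — not project code, using `PurePosixPath` for a deterministic separator:

```python
from fnmatch import fnmatch
from pathlib import PurePosixPath

url = "abfss://container@account.dfs.core.windows.net/folder"

# fnmatch's '*' also matches '/', so any depth below the external location passes:
assert fnmatch(f"{url}/a/b", url + "*")

# PurePath.match() compares path components from the right, so 'folder/*' only
# matches locations exactly one level below the external location:
assert PurePosixPath(f"{url}/a").match(f"{url}/*")
assert not PurePosixPath(f"{url}/a/b").match(f"{url}/*")
```

That difference is why the two versions of `_validate_location` accept different sets of nested locations under a registered external location URL.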
@classmethod - def initial(cls, workspace_name: str, catalog_name: str, table: Table, recon_tolerance_percent: int) -> "Rule": + def initial(cls, workspace_name: str, catalog_name: str, table: Table) -> "Rule": return cls( workspace_name=workspace_name, catalog_name=catalog_name, @@ -40,7 +37,6 @@ def initial(cls, workspace_name: str, catalog_name: str, table: Table, recon_tol dst_schema=table.database, src_table=table.name, dst_table=table.name, - recon_tolerance_percent=recon_tolerance_percent, ) @classmethod @@ -52,12 +48,8 @@ def from_src_dst(cls, src_table: TableInfo, dst_schema: SchemaInfo) -> "Rule": dst_schema=str(dst_schema.name or ""), src_table=str(src_table.name or ""), dst_table=str(src_table.name or ""), - recon_tolerance_percent=0, ) - def match(self, table: TableIdentifier) -> bool: - return table.catalog == "hive_metastore" and self.src_schema == table.schema and self.src_table == table.table - @property def as_uc_table_key(self): return f"{self.catalog_name}.{self.dst_schema}.{self.dst_table}" @@ -83,17 +75,10 @@ class TableMapping: FILENAME = 'mapping.csv' UCX_SKIP_PROPERTY = "databricks.labs.ucx.skip" - def __init__( - self, - installation: Installation, - ws: WorkspaceClient, - sql_backend: SqlBackend, - recon_tolerance_percent: int = 0, - ): + def __init__(self, installation: Installation, ws: WorkspaceClient, sql_backend: SqlBackend): self._installation = installation self._ws = ws self._sql_backend = sql_backend - self._recon_tolerance_percent = recon_tolerance_percent def current_tables(self, tables: TablesCrawler, workspace_name: str, catalog_name: str): tables_snapshot = tables.snapshot() @@ -101,7 +86,7 @@ def current_tables(self, tables: TablesCrawler, workspace_name: str, catalog_nam msg = "No tables found. Please run: databricks labs ucx ensure-assessment-run" raise ValueError(msg) for table in tables_snapshot: - yield Rule.initial(workspace_name, catalog_name, table, self._recon_tolerance_percent) + yield Rule.initial(workspace_name, catalog_name, table) def save(self, tables: TablesCrawler, workspace_info: WorkspaceInfo) -> str: workspace_name = workspace_info.current() @@ -173,8 +158,6 @@ def _get_databases_in_scope(self, databases: set[str]): return Threads.strict("checking databases for skip property", tasks) def _get_database_in_scope_task(self, database: str) -> str | None: - if database.startswith("mounted_"): - return database describe = {} try: for value in self._sql_backend.fetch(f"DESCRIBE SCHEMA EXTENDED {escape_sql_identifier(database)}"): @@ -199,11 +182,14 @@ def _get_table_in_scope_task(self, table_to_migrate: TableToMigrate) -> TableToM if self.exists_in_uc(table, rule.as_uc_table_key): logger.info(f"The intended target for {table.key}, {rule.as_uc_table_key}, already exists.") return None - properties = self._get_table_properties(table) - if not properties: + try: + result = self._sql_backend.fetch( + f"SHOW TBLPROPERTIES {escape_sql_identifier(table.database)}.{escape_sql_identifier(table.name)}" + ) + except DatabricksError as err: + logger.warning(f"Failed to get properties for Table {table.key}: {err}") return None - - for value in properties: + for value in result: if value["key"] == self.UCX_SKIP_PROPERTY: logger.info(f"{table.key} is marked to be skipped") return None @@ -224,45 +210,20 @@ def _get_table_in_scope_task(self, table_to_migrate: TableToMigrate) -> TableToM return table_to_migrate - def _get_table_properties(self, table: Table): - table_identifier = 
f"{escape_sql_identifier(table.database)}.{escape_sql_identifier(table.name)}" - if table.is_table_in_mount: - table_identifier = f"delta.`{table.location}`" - - try: - return self._sql_backend.fetch(f"SHOW TBLPROPERTIES {table_identifier}") - except DatabricksError as err: - logger.warning(f"Failed to get properties for Table {table.key}: {err}") - return None - def exists_in_uc(self, src_table: Table, target_key: str): # Attempts to get the target table info from UC returns True if it exists. - table_info = self._try_get_table_in_uc(target_key) - if not table_info: - return False - # Corner case for tables in mounts where the table exists in UC, but the location is not the same - # from the one provided in the mapping - if src_table.is_table_in_mount and table_info.storage_location != src_table.location: - raise ResourceConflict( - f"Expected to be migrated from {src_table.key}, but got {table_info.storage_location}. " - "You can skip this error using the CLI command: " - "databricks labs ucx skip " - f"--schema {src_table.database} --table {src_table.name}" - ) - if not table_info.properties: - return True - upgraded_from = table_info.properties.get("upgraded_from") - if upgraded_from and upgraded_from != src_table.key and not src_table.is_table_in_mount: - raise ResourceConflict( - f"Expected to be migrated from {src_table.key}, but got {upgraded_from}. " - "You can skip this error using the CLI command: " - "databricks labs ucx skip " - f"--schema {src_table.database} --table {src_table.name}" - ) - return True - - def _try_get_table_in_uc(self, target_key: str): try: - return self._ws.tables.get(target_key) + table_info = self._ws.tables.get(target_key) + if not table_info.properties: + return True + upgraded_from = table_info.properties.get("upgraded_from") + if upgraded_from and upgraded_from != src_table.key: + raise ResourceConflict( + f"Expected to be migrated from {src_table.key}, but got {upgraded_from}. 
" + "You can skip this error using the CLI command: " + "databricks labs ucx skip " + f"--schema {src_table.database} --table {src_table.name}" + ) + return True except NotFound: - return None + return False diff --git a/src/databricks/labs/ucx/hive_metastore/migration_status.py b/src/databricks/labs/ucx/hive_metastore/migration_status.py index 3fff00c0c8..e5b0bfbe32 100644 --- a/src/databricks/labs/ucx/hive_metastore/migration_status.py +++ b/src/databricks/labs/ucx/hive_metastore/migration_status.py @@ -1,7 +1,7 @@ import datetime import logging from dataclasses import dataclass, replace -from collections.abc import Iterable, KeysView +from collections.abc import Iterable from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient @@ -53,9 +53,6 @@ def get(self, schema: str, table: str) -> MigrationStatus | None: return None return dst - def snapshot(self) -> KeysView[tuple[str, str]]: - return self._index.keys() - class MigrationStatusRefresher(CrawlerBase[MigrationStatus]): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema, table_crawler: TablesCrawler): diff --git a/src/databricks/labs/ucx/hive_metastore/table_migrate.py b/src/databricks/labs/ucx/hive_metastore/table_migrate.py index cfab0dab9f..0cf15bf66b 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migrate.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migrate.py @@ -6,7 +6,6 @@ from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend -from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.sdk import WorkspaceClient from databricks.labs.ucx.hive_metastore import TablesCrawler, Mounts @@ -169,8 +168,6 @@ def _migrate_table( return self._migrate_table_create_ctas(src_table.src, src_table.rule, grants, mounts) if src_table.src.what == What.EXTERNAL_SYNC: return self._migrate_external_table(src_table.src, src_table.rule, grants) - if src_table.src.what == What.TABLE_IN_MOUNT: - return self._migrate_table_in_mount(src_table.src, src_table.rule, grants) if src_table.src.what == What.EXTERNAL_HIVESERDE: # This hiveserde_in_place_migrate is used to determine if current hiveserde migration should use in-place migration or CTAS. # We will provide two workflows for hiveserde table migration: @@ -215,9 +212,9 @@ def _migrate_view_table(self, src_view: ViewToMigrate, grants: list[Grant] | Non logger.debug(f"Migrating view {src_view.src.key} to using SQL query: {view_migrate_sql}") try: self._backend.execute(view_migrate_sql) - self._backend.execute(self._sql_alter_to(src_view.src, src_view.rule.as_uc_table_key)) + self._backend.execute(src_view.src.sql_alter_to(src_view.rule.as_uc_table_key)) self._backend.execute( - self._sql_alter_from(src_view.src, src_view.rule.as_uc_table_key, self._ws.get_workspace_id()) + src_view.src.sql_alter_from(src_view.rule.as_uc_table_key, self._ws.get_workspace_id()) ) except DatabricksError as e: logger.warning(f"Failed to migrate view {src_view.src.key} to {src_view.rule.as_uc_table_key}: {e}") @@ -244,7 +241,7 @@ def _migrate_external_table(self, src_table: Table, rule: Rule, grants: list[Gra f"Status code: {sync_result.status_code}. 
Description: {sync_result.description}" ) return False - self._backend.execute(self._sql_alter_from(src_table, rule.as_uc_table_key, self._ws.get_workspace_id())) + self._backend.execute(src_table.sql_alter_from(rule.as_uc_table_key, self._ws.get_workspace_id())) return self._migrate_acl(src_table, rule, grants) def _migrate_external_table_hiveserde_in_place( @@ -283,8 +280,8 @@ def _migrate_external_table_hiveserde_in_place( ) try: self._backend.execute(table_migrate_sql) - self._backend.execute(self._sql_alter_to(src_table, rule.as_uc_table_key)) - self._backend.execute(self._sql_alter_from(src_table, rule.as_uc_table_key, self._ws.get_workspace_id())) + self._backend.execute(src_table.sql_alter_to(rule.as_uc_table_key)) + self._backend.execute(src_table.sql_alter_from(rule.as_uc_table_key, self._ws.get_workspace_id())) except DatabricksError as e: logger.warning(f"Failed to migrate table {src_table.key} to {rule.as_uc_table_key}: {e}") return False @@ -298,8 +295,8 @@ def _migrate_dbfs_root_table(self, src_table: Table, rule: Rule, grants: list[Gr ) try: self._backend.execute(table_migrate_sql) - self._backend.execute(self._sql_alter_to(src_table, rule.as_uc_table_key)) - self._backend.execute(self._sql_alter_from(src_table, rule.as_uc_table_key, self._ws.get_workspace_id())) + self._backend.execute(src_table.sql_alter_to(rule.as_uc_table_key)) + self._backend.execute(src_table.sql_alter_from(rule.as_uc_table_key, self._ws.get_workspace_id())) except DatabricksError as e: logger.warning(f"Failed to migrate table {src_table.key} to {rule.as_uc_table_key}: {e}") return False @@ -319,23 +316,8 @@ def _migrate_table_create_ctas(self, src_table: Table, rule: Rule, grants: list[ logger.debug(f"Migrating table {src_table.key} to {rule.as_uc_table_key} using SQL query: {table_migrate_sql}") try: self._backend.execute(table_migrate_sql) - self._backend.execute(self._sql_alter_to(src_table, rule.as_uc_table_key)) - self._backend.execute(self._sql_alter_from(src_table, rule.as_uc_table_key, self._ws.get_workspace_id())) - except DatabricksError as e: - logger.warning(f"Failed to migrate table {src_table.key} to {rule.as_uc_table_key}: {e}") - return False - return self._migrate_acl(src_table, rule, grants) - - def _migrate_table_in_mount(self, src_table: Table, rule: Rule, grants: list[Grant] | None = None): - target_table_key = rule.as_uc_table_key - try: - table_schema = self._backend.fetch(f"DESCRIBE TABLE delta.`{src_table.location}`;") - table_migrate_sql = src_table.sql_migrate_table_in_mount(target_table_key, table_schema) - logger.info( - f"Migrating table in mount {src_table.location} to UC table {rule.as_uc_table_key} using SQL query: {table_migrate_sql}" - ) - self._backend.execute(table_migrate_sql) - self._backend.execute(self._sql_alter_from(src_table, rule.as_uc_table_key, self._ws.get_workspace_id())) + self._backend.execute(src_table.sql_alter_to(rule.as_uc_table_key)) + self._backend.execute(src_table.sql_alter_from(rule.as_uc_table_key, self._ws.get_workspace_id())) except DatabricksError as e: logger.warning(f"Failed to migrate table {src_table.key} to {rule.as_uc_table_key}: {e}") return False @@ -478,14 +460,3 @@ def _match_grants(table: Table, grants: Iterable[Grant], migrated_groups: list[M grant = dataclasses.replace(grant, principal=matched_group[0]) matched_grants.append(grant) return matched_grants - - def _sql_alter_to(self, table: Table, target_table_key: str): - return f"ALTER {table.kind} {escape_sql_identifier(table.key)} SET TBLPROPERTIES ('upgraded_to' = 
'{target_table_key}');" - - def _sql_alter_from(self, table: Table, target_table_key: str, ws_id: int): - source = table.location if table.is_table_in_mount else table.key - return ( - f"ALTER {table.kind} {escape_sql_identifier(target_table_key)} SET TBLPROPERTIES " - f"('upgraded_from' = '{source}'" - f" , '{table.UPGRADED_FROM_WS_PARAM}' = '{ws_id}');" - ) diff --git a/src/databricks/labs/ucx/hive_metastore/table_size.py b/src/databricks/labs/ucx/hive_metastore/table_size.py index fe87ae3b67..9451841cdc 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_size.py +++ b/src/databricks/labs/ucx/hive_metastore/table_size.py @@ -71,8 +71,6 @@ def snapshot(self) -> list[TableSize]: def _safe_get_table_size(self, table_full_name: str) -> int | None: logger.debug(f"Evaluating {table_full_name} table size.") try: - # refresh table statistics to avoid stale stats in HMS - self._backend.execute(f"ANALYZE table {table_full_name} compute STATISTICS NOSCAN") # pylint: disable-next=protected-access return self._spark._jsparkSession.table(table_full_name).queryExecution().analyzed().stats().sizeInBytes() except Exception as e: # pylint: disable=broad-exception-caught diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 6fdbbeec1c..3513868ede 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -1,7 +1,7 @@ import logging import re import typing -from collections.abc import Iterable, Iterator +from collections.abc import Iterable from dataclasses import dataclass from enum import Enum, auto from functools import partial @@ -28,7 +28,6 @@ class What(Enum): DBFS_ROOT_DELTA = auto() DBFS_ROOT_NON_DELTA = auto() VIEW = auto() - TABLE_IN_MOUNT = auto() DB_DATASET = auto() UNKNOWN = auto() @@ -88,8 +87,6 @@ def is_delta(self) -> bool: @property def key(self) -> str: - if self.is_table_in_mount: - return f"{self.catalog}.{self.database}.{self.location}".lower() return f"{self.catalog}.{self.database}.{self.name}".lower() @property @@ -106,6 +103,16 @@ def __eq__(self, other): def kind(self) -> str: return "VIEW" if self.view_text is not None else "TABLE" + def sql_alter_to(self, target_table_key): + return f"ALTER {self.kind} {escape_sql_identifier(self.key)} SET TBLPROPERTIES ('upgraded_to' = '{target_table_key}');" + + def sql_alter_from(self, target_table_key, ws_id): + return ( + f"ALTER {self.kind} {escape_sql_identifier(target_table_key)} SET TBLPROPERTIES " + f"('upgraded_from' = '{self.key}'" + f" , '{self.UPGRADED_FROM_WS_PARAM}' = '{ws_id}');" + ) + def sql_unset_upgraded_to(self): return f"ALTER {self.kind} {escape_sql_identifier(self.key)} UNSET TBLPROPERTIES IF EXISTS('upgraded_to');" @@ -140,6 +147,14 @@ def is_format_supported_for_sync(self) -> bool: return False return self.table_format.upper() in {"DELTA", "PARQUET", "CSV", "JSON", "ORC", "TEXT", "AVRO"} + @property + def is_format_supported_for_create_like(self) -> bool: + # Based on documentation + # https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-table-like.html + if self.table_format is None: + return False + return self.table_format.upper() in {"DELTA", "PARQUET", "CSV", "JSON", "TEXT"} + @property def is_databricks_dataset(self) -> bool: if not self.location: @@ -149,16 +164,10 @@ def is_databricks_dataset(self) -> bool: return True return False - @property - def is_table_in_mount(self) -> bool: - return self.database.startswith("mounted_") and self.is_delta - @property 
def what(self) -> What: if self.is_databricks_dataset: return What.DB_DATASET - if self.is_table_in_mount: - return What.TABLE_IN_MOUNT if self.is_dbfs_root and self.table_format == "DELTA": return What.DBFS_ROOT_DELTA if self.is_dbfs_root: @@ -298,29 +307,6 @@ def sql_migrate_dbfs(self, target_table_key): def sql_migrate_view(self, target_table_key): return f"CREATE VIEW IF NOT EXISTS {escape_sql_identifier(target_table_key)} AS {self.view_text};" - def sql_migrate_table_in_mount(self, target_table_key: str, table_schema: Iterator[typing.Any]): - fields = [] - partitioned_fields = [] - next_fileds_are_partitioned = False - for key, value, _ in table_schema: - if key == "# Partition Information": - continue - if key == "# col_name": - next_fileds_are_partitioned = True - continue - if next_fileds_are_partitioned: - partitioned_fields.append(f"{key}") - else: - fields.append(f"{key} {value}") - - partitioned_str = "" - if partitioned_fields: - partitioning_columns = ", ".join(partitioned_fields) - partitioned_str = f"PARTITIONED BY ({partitioning_columns})" - schema = ", ".join(fields) - - return f"CREATE TABLE IF NOT EXISTS {escape_sql_identifier(target_table_key)} ({schema}) {partitioned_str} LOCATION '{self.location}';" - @dataclass class TableError: diff --git a/src/databricks/labs/ucx/hive_metastore/workflows.py b/src/databricks/labs/ucx/hive_metastore/workflows.py index 42d3fce3f3..3a594bdbe6 100644 --- a/src/databricks/labs/ucx/hive_metastore/workflows.py +++ b/src/databricks/labs/ucx/hive_metastore/workflows.py @@ -179,9 +179,9 @@ def migration_report(self, ctx: RuntimeContext): dashboard _before_ all tasks have been completed, but then only already completed information is shown.""" -class ScanTablesInMounts(Workflow): +class MigrateTablesInMounts(Workflow): def __init__(self): - super().__init__('scan-tables-in-mounts-experimental') + super().__init__('migrate-tables-in-mounts-experimental') @job_task def scan_tables_in_mounts_experimental(self, ctx: RuntimeContext): @@ -199,23 +199,3 @@ def refresh_migration_status(self, ctx: RuntimeContext): def migration_report(self, ctx: RuntimeContext): """Refreshes the migration dashboard after all previous tasks have been completed. Note that you can access the dashboard _before_ all tasks have been completed, but then only already completed information is shown.""" - - -class MigrateTablesInMounts(Workflow): - def __init__(self): - super().__init__('migrate-tables-in-mounts-experimental') - - @job_task(job_cluster="table_migration", depends_on=[ScanTablesInMounts.scan_tables_in_mounts_experimental]) - def migrate_tables_in_mounts_experimental(self, ctx: RuntimeContext): - """[EXPERIMENTAL] This workflow migrates `delta tables stored in mount points` to Unity Catalog using a Create Table statement.""" - ctx.tables_migrator.migrate_tables(what=What.TABLE_IN_MOUNT) - - @job_task(job_cluster="table_migration", depends_on=[migrate_tables_in_mounts_experimental]) - def refresh_migration_status(self, ctx: RuntimeContext): - """Refresh the migration status to present it in the dashboard.""" - ctx.tables_migrator.index_full_refresh() - - @job_task(dashboard="migration_main", depends_on=[refresh_migration_status]) - def migration_report(self, ctx: RuntimeContext): - """Refreshes the migration dashboard after all previous tasks have been completed. 
Note that you can access the - dashboard _before_ all tasks have been completed, but then only already completed information is shown.""" diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index 4e6d52b6cc..376453c12e 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -61,7 +61,6 @@ from databricks.labs.ucx.installer.mixins import InstallationMixin from databricks.labs.ucx.installer.policy import ClusterPolicyInstaller from databricks.labs.ucx.installer.workflows import WorkflowsDeployment -from databricks.labs.ucx.recon.migration_recon import ReconResult from databricks.labs.ucx.runtime import Workflows from databricks.labs.ucx.source_code.jobs import JobProblem from databricks.labs.ucx.workspace_access.base import Permissions @@ -106,7 +105,6 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str): functools.partial(table, "workflow_problems", JobProblem), functools.partial(table, "udfs", Udf), functools.partial(table, "logs", LogRecord), - functools.partial(table, "recon_results", ReconResult), ], ) deployer.deploy_view("objects", "queries/views/objects.sql") @@ -114,7 +112,6 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str): deployer.deploy_view("table_estimates", "queries/views/table_estimates.sql") deployer.deploy_view("misc_patterns", "queries/views/misc_patterns.sql") deployer.deploy_view("code_patterns", "queries/views/code_patterns.sql") - deployer.deploy_view("reconciliation_results", "queries/views/reconciliation_results.sql") def extract_major_minor(version_string): @@ -220,9 +217,6 @@ def _prompt_for_new_installation(self) -> WorkspaceConfig: f"Does given workspace {self.workspace_client.get_workspace_id()} " f"block Internet access?" 
) trigger_job = self.prompts.confirm("Do you want to trigger assessment job after installation?") - recon_tolerance_percent = int( - self.prompts.question("Reconciliation threshold, in percentage", default="5", valid_number=True) - ) return WorkspaceConfig( inventory_database=inventory_database, workspace_group_regex=configure_groups.workspace_group_regex, diff --git a/src/databricks/labs/ucx/installer/policy.py b/src/databricks/labs/ucx/installer/policy.py index 64f501093c..9b73ec84e1 100644 --- a/src/databricks/labs/ucx/installer/policy.py +++ b/src/databricks/labs/ucx/installer/policy.py @@ -113,7 +113,6 @@ def _definition(self, conf: dict, instance_profile: str | None, instance_pool_id policy_definition["aws_attributes.availability"] = self._policy_config( compute.AwsAvailability.ON_DEMAND.value ) - policy_definition["aws_attributes.zone_id"] = self._policy_config("auto") elif self._ws.config.is_azure: policy_definition["azure_attributes.availability"] = self._policy_config( compute.AzureAvailability.ON_DEMAND_AZURE.value @@ -128,8 +127,6 @@ def _definition(self, conf: dict, instance_profile: str | None, instance_pool_id policy_definition.pop("node_type_id") # 'availability' cannot be supplied when an instance pool ID is provided policy_definition.pop("aws_attributes.availability", "") - # 'zone_id' cannot be supplied when an instance pool ID is provided - policy_definition.pop("aws_attributes.zone_id", "") policy_definition.pop("azure_attributes.availability", "") policy_definition.pop("gcp_attributes.availability", "") return json.dumps(policy_definition) diff --git a/src/databricks/labs/ucx/installer/workflows.py b/src/databricks/labs/ucx/installer/workflows.py index f5641d8f88..7895282888 100644 --- a/src/databricks/labs/ucx/installer/workflows.py +++ b/src/databricks/labs/ucx/installer/workflows.py @@ -52,7 +52,6 @@ logger = logging.getLogger(__name__) -TEST_JOBS_PURGE_TIMEOUT = timedelta(hours=1, minutes=15) EXTRA_TASK_PARAMS = { "job_id": "{{job_id}}", "run_id": "{{run_id}}", @@ -440,10 +439,6 @@ def _config_file(self): def _is_testing(self): return self._product_info.product_name() != "ucx" - @staticmethod - def _get_test_purge_time() -> str: - return (datetime.utcnow() + TEST_JOBS_PURGE_TIMEOUT).strftime("%Y%m%d%H") - def _create_readme(self) -> str: debug_notebook_link = self._installation.workspace_markdown_link('debug notebook', 'DEBUG.py') markdown = [ @@ -573,7 +568,6 @@ def _job_settings(self, step_name: str, remote_wheels: list[str]) -> dict[str, A email_notifications = jobs.JobEmailNotifications( on_success=[self._my_username], on_failure=[self._my_username] ) - job_tasks = [] job_clusters: set[str] = {Task.job_cluster} for task in self._tasks: @@ -586,14 +580,9 @@ def _job_settings(self, step_name: str, remote_wheels: list[str]) -> dict[str, A job_tasks.append(self._job_parse_logs_task(job_tasks, step_name, remote_wheels)) version = self._product_info.version() version = version if not self._ws.config.is_gcp else version.replace("+", "-") - tags = {"version": f"v{version}"} - if self._is_testing(): - # add RemoveAfter tag for test job cleanup - date_to_remove = self._get_test_purge_time() - tags.update({"RemoveAfter": date_to_remove}) return { "name": self._name(step_name), - "tags": tags, + "tags": {"version": f"v{version}"}, "job_clusters": self._job_clusters(job_clusters), "email_notifications": email_notifications, "tasks": job_tasks, diff --git a/src/databricks/labs/ucx/mixins/fixtures.py b/src/databricks/labs/ucx/mixins/fixtures.py index 94a1ad5448..43b9373980 
100644 --- a/src/databricks/labs/ucx/mixins/fixtures.py +++ b/src/databricks/labs/ucx/mixins/fixtures.py @@ -46,8 +46,6 @@ Dashboard, WidgetOptions, WidgetPosition, - EndpointTagPair, - EndpointTags, ) from databricks.sdk.service.workspace import ImportFormat, Language @@ -57,7 +55,7 @@ # pylint: disable=redefined-outer-name,too-many-try-statements,import-outside-toplevel,unnecessary-lambda,too-complex,invalid-name logger = logging.getLogger(__name__) -TEST_JOBS_PURGE_TIMEOUT = timedelta(hours=1, minutes=15) # 15 minutes grace for jobs starting at the end of the hour +JOBS_PURGE_TIMEOUT = timedelta(days=1) def factory(name, create, remove): @@ -723,10 +721,6 @@ def create( if "instance_pool_id" not in kwargs: kwargs["node_type_id"] = ws.clusters.select_node_type(local_disk=True, min_memory_gb=16) - if "custom_tags" not in kwargs: - kwargs["custom_tags"] = {"RemoveAfter": get_test_purge_time()} - else: - kwargs["custom_tags"]["RemoveAfter"] = get_test_purge_time() return ws.clusters.create( cluster_name=cluster_name, spark_version=spark_version, @@ -767,9 +761,7 @@ def create(*, instance_pool_name=None, node_type_id=None, **kwargs): instance_pool_name = f"sdk-{make_random(4)}" if node_type_id is None: node_type_id = ws.clusters.select_node_type(local_disk=True, min_memory_gb=16) - return ws.instance_pools.create( - instance_pool_name, node_type_id, custom_tags={"RemoveAfter": get_test_purge_time()}, **kwargs - ) + return ws.instance_pools.create(instance_pool_name, node_type_id, **kwargs) yield from factory("instance pool", create, lambda item: ws.instance_pools.delete(item.instance_pool_id)) @@ -809,7 +801,7 @@ def create(notebook_path: str | Path | None = None, **kwargs): ] # add RemoveAfter tag for test job cleanup - date_to_remove = get_test_purge_time() + date_to_remove = (datetime.now() + JOBS_PURGE_TIMEOUT).strftime("%Y-%m-%d") remove_after_tag = {"key": "RemoveAfter", "value": date_to_remove} if 'tags' not in kwargs: @@ -882,14 +874,12 @@ def create( if cluster_size is None: cluster_size = "2X-Small" - remove_after_tags = EndpointTags(custom_tags=[EndpointTagPair(key="RemoveAfter", value=get_test_purge_time())]) return ws.warehouses.create( name=warehouse_name, cluster_size=cluster_size, warehouse_type=warehouse_type, max_num_clusters=max_num_clusters, enable_serverless_compute=enable_serverless_compute, - tags=remove_after_tags, **kwargs, ) @@ -974,7 +964,7 @@ def create(*, catalog_name: str = "hive_metastore", name: str | None = None) -> if name is None: name = f"ucx_S{make_random(4)}".lower() full_name = f"{catalog_name}.{name}".lower() - sql_backend.execute(f"CREATE SCHEMA {full_name} WITH DBPROPERTIES (RemoveAfter={get_test_purge_time()})") + sql_backend.execute(f"CREATE SCHEMA {full_name}") schema_info = SchemaInfo(catalog_name=catalog_name, name=name, full_name=full_name) logger.info( f"Schema {schema_info.full_name}: " @@ -997,7 +987,7 @@ def remove(schema_info: SchemaInfo): @pytest.fixture # pylint: disable-next=too-many-statements def make_table(ws, sql_backend, make_schema, make_random) -> Generator[Callable[..., TableInfo], None, None]: - def create( # pylint: disable=too-many-locals,too-many-arguments,too-many-statements + def create( # pylint: disable=too-many-locals,too-many-arguments *, catalog_name="hive_metastore", schema_name: str | None = None, @@ -1006,7 +996,6 @@ def create( # pylint: disable=too-many-locals,too-many-arguments,too-many-state non_delta: bool = False, external: bool = False, external_csv: str | None = None, - external_delta: str | None = None, 
view: bool = False, tbl_properties: dict[str, str] | None = None, hiveserde_ddl: str | None = None, @@ -1044,11 +1033,6 @@ def create( # pylint: disable=too-many-locals,too-many-arguments,too-many-state data_source_format = DataSourceFormat.CSV storage_location = external_csv ddl = f"{ddl} USING CSV OPTIONS (header=true) LOCATION '{storage_location}'" - elif external_delta is not None: - table_type = TableType.EXTERNAL - data_source_format = DataSourceFormat.DELTA - storage_location = external_delta - ddl = f"{ddl} (id string) LOCATION '{storage_location}'" elif external: # external table table_type = TableType.EXTERNAL @@ -1063,19 +1047,7 @@ def create( # pylint: disable=too-many-locals,too-many-arguments,too-many-state storage_location = f"dbfs:/user/hive/warehouse/{schema_name}/{name}" ddl = f"{ddl} (id INT, value STRING)" if tbl_properties: - tbl_properties.update({"RemoveAfter": get_test_purge_time()}) - else: - tbl_properties = {"RemoveAfter": get_test_purge_time()} - - str_properties = ",".join([f" '{k}' = '{v}' " for k, v in tbl_properties.items()]) - - # table properties fails with CTAS statements - alter_table_tbl_properties = "" - if ctas or non_delta: - alter_table_tbl_properties = ( - f'ALTER {"VIEW" if view else "TABLE"} {full_name} SET TBLPROPERTIES ({str_properties})' - ) - else: + str_properties = ",".join([f" '{k}' = '{v}' " for k, v in tbl_properties.items()]) ddl = f"{ddl} TBLPROPERTIES ({str_properties})" if hiveserde_ddl: @@ -1085,11 +1057,6 @@ def create( # pylint: disable=too-many-locals,too-many-arguments,too-many-state storage_location = storage_override sql_backend.execute(ddl) - - # CTAS AND NON_DELTA does not support TBLPROPERTIES - if ctas or non_delta: - sql_backend.execute(alter_table_tbl_properties) - table_info = TableInfo( catalog_name=catalog_name, schema_name=schema_name, @@ -1370,7 +1337,3 @@ def remove(dashboard: Dashboard): logger.info(f"Can't delete dashboard {e}") yield from factory("dashboard", create, remove) - - -def get_test_purge_time() -> str: - return (datetime.utcnow() + TEST_JOBS_PURGE_TIMEOUT).strftime("%Y%m%d%H") diff --git a/src/databricks/labs/ucx/queries/migration/main/01_0_data_object_migration_status.md b/src/databricks/labs/ucx/queries/migration/main/01_0_data_object_migration_status.md index fa0352a2e6..c1fbcf9be0 100644 --- a/src/databricks/labs/ucx/queries/migration/main/01_0_data_object_migration_status.md +++ b/src/databricks/labs/ucx/queries/migration/main/01_0_data_object_migration_status.md @@ -1,19 +1,12 @@ --- widget title=Table migration status, row=1, col=0, size_x=4, size_y=8 +-- widget title=Table migration status, row=1, col=0, size_x=2, size_y=8 -## Table migration status +## 1 - Table migration status -The two widgets on the right show high-level summary of the table migration. The first widget shows the migration -progress, and the second widget shows the data reconciliation results. - - -The table below assists with verifying if, how the tables are migrated and their correctness. It can be filtered on the -table name and migration status. Next to table metadata, the table shows: +The table on the right assist with verifying if and how the tables are migrated. It can be filtered on the table name +and migration status. 
Next to table metadata, the table shows: - The table name before migrating - The migration status -- If migrated - - The table name after migrating - - Whether schema matches and whether data matches - - Number of rows in the original table and the target table - - Number of rows that are in the original table but not in the target table, and vice versa (only when compare_rows is `True`) +- If migrated, the table name after migrating -The table migration status is updated after running [table migration workflow](https://github.com/databrickslabs/ucx/blob/main/README.md#table-migration-workflow). +The table migration status is updated after running +[table migration workflow](https://github.com/databrickslabs/ucx/blob/main/README.md#table-migration-workflow). diff --git a/src/databricks/labs/ucx/queries/migration/main/01_1_data_object_migration_status.sql b/src/databricks/labs/ucx/queries/migration/main/01_1_data_object_migration_status.sql new file mode 100644 index 0000000000..6ce5d4e2a3 --- /dev/null +++ b/src/databricks/labs/ucx/queries/migration/main/01_1_data_object_migration_status.sql @@ -0,0 +1,20 @@ +-- viz type=table, name=Migration status, search_by=table_name,upgraded_status, columns=table_name,object_type,table_format,location,upgraded_status,upgraded_to_table_name,view_text,storage_properties,is_partitioned +-- widget title=Migration status, row=1, col=2, size_x=4, size_y=8 +SELECT + concat_ws('.', tables.`catalog`, tables.`database`, tables.name) AS table_name, + tables.object_type, + tables.table_format, + tables.location, + CASE + WHEN migration_status.dst_table IS NOT NULL THEN 'Yes' + ELSE 'No' + END AS upgraded_status, + concat_ws('.', migration_status.dst_catalog, migration_status.dst_schema, migration_status.dst_table) AS upgraded_to_table_name, + tables.view_text, + tables.storage_properties, + tables.is_partitioned +FROM + $inventory.tables AS tables + LEFT JOIN + $inventory.migration_status AS migration_status + ON tables.`database` = migration_status.src_schema AND tables.name = migration_status.src_table \ No newline at end of file diff --git a/src/databricks/labs/ucx/queries/migration/main/01_1_data_object_migration_summary.sql b/src/databricks/labs/ucx/queries/migration/main/01_1_data_object_migration_summary.sql deleted file mode 100644 index 6c5d57798d..0000000000 --- a/src/databricks/labs/ucx/queries/migration/main/01_1_data_object_migration_summary.sql +++ /dev/null @@ -1,16 +0,0 @@ --- viz type=counter, name=Data Migration Progress, counter_label=Table migration progress, value_column=migrated_rate --- widget row=1, col=4, size_x=2, size_y=4 -SELECT - COUNT( - CASE - WHEN migration_status.dst_table IS NOT NULL THEN 1 - ELSE NULL - END - ) AS migrated, - count(*) AS total, - concat(round(migrated / total * 100, 2), '%') AS migrated_rate -FROM - $inventory.tables AS tables - LEFT JOIN - $inventory.migration_status AS migration_status - ON tables.`database` = migration_status.src_schema AND tables.name = migration_status.src_table \ No newline at end of file diff --git a/src/databricks/labs/ucx/queries/migration/main/02_0_code_compatibility_problems.md b/src/databricks/labs/ucx/queries/migration/main/02_0_code_compatibility_problems.md deleted file mode 100644 index 75e97b24b2..0000000000 --- a/src/databricks/labs/ucx/queries/migration/main/02_0_code_compatibility_problems.md +++ /dev/null @@ -1,11 +0,0 @@ --- widget title=Workflow migration problems, row=2, col=0, size_x=2, size_y=8 - -## 2 - Code compatibility problems - -The table on the right assist with 
verifying if workflows are Unity Catalog compatible. It can be filtered on the path, -problem code and workflow name. The table: -- Points to a problem detected in the code using the code path, workflow & task reference and start/end line & column; -- Explains the problem with a human-readable message and a code. - -The code compatibility problems are updated after running -[Jobs Static Code Analysis Workflow](https://github.com/databrickslabs/ucx/blob/main/README.md#workflow-linter-workflow). diff --git a/src/databricks/labs/ucx/queries/migration/main/02_1_code_compatibility_problems.sql b/src/databricks/labs/ucx/queries/migration/main/02_1_code_compatibility_problems.sql deleted file mode 100644 index 70994baca7..0000000000 --- a/src/databricks/labs/ucx/queries/migration/main/02_1_code_compatibility_problems.sql +++ /dev/null @@ -1,14 +0,0 @@ --- viz type=table, name=Workflow migration problems, search_by=path,code,job_name, columns=path,code,message,workflow_id,workflow_name,task_key,start_line,start_col,end_line,end_col --- widget title=Workflow migration problems, row=2, col=2, size_x=4, size_y=8 -SELECT - path, - code, - message, - job_id AS workflow_id, - job_name AS workflow_name, - task_key, - start_line, - start_col, - end_line, - end_col -FROM $inventory.workflow_problems \ No newline at end of file diff --git a/src/databricks/labs/ucx/queries/migration/main/02_1_data_reconciliation_summary.sql b/src/databricks/labs/ucx/queries/migration/main/02_1_data_reconciliation_summary.sql deleted file mode 100644 index a0acfd69aa..0000000000 --- a/src/databricks/labs/ucx/queries/migration/main/02_1_data_reconciliation_summary.sql +++ /dev/null @@ -1,14 +0,0 @@ --- viz type=counter, name=Data Reconciliation Results, counter_label=Table migration success rate, value_column=success_rate --- widget row=2, col=4, size_x=2, size_y=4 -SELECT - COUNT( - CASE - WHEN schema_matches - AND data_matches THEN 1 - ELSE NULL - END - ) AS success, - count(*) AS total, - concat(round(success / total * 100, 2), '%') AS success_rate -FROM - $inventory.reconciliation_results \ No newline at end of file diff --git a/src/databricks/labs/ucx/queries/migration/main/03_1_data_reconciliation_status.sql b/src/databricks/labs/ucx/queries/migration/main/03_1_data_reconciliation_status.sql deleted file mode 100644 index 34e68e252a..0000000000 --- a/src/databricks/labs/ucx/queries/migration/main/03_1_data_reconciliation_status.sql +++ /dev/null @@ -1,31 +0,0 @@ --- viz type=table, name=Migration status, search_by=table_name,upgraded_status, columns=table_name,object_type,table_format,location,upgraded_status,upgraded_to_table_name,schema_matches,column_comparison,data_matches,source_row_count,target_row_count,source_missing_count,target_missing_count,reconciliation_error,view_text,storage_properties,is_partitioned --- widget title=Migration status, row=3, col=0, size_x=8, size_y=16 -SELECT - concat_ws('.', tables.`catalog`, tables.`database`, tables.name) AS table_name, - tables.object_type, - tables.table_format, - tables.location, - CASE - WHEN migration_status.dst_table IS NOT NULL THEN 'Yes' - ELSE 'No' - END AS upgraded_status, - concat_ws('.', migration_status.dst_catalog, migration_status.dst_schema, migration_status.dst_table) AS upgraded_to_table_name, - reconciliation_results.schema_matches, - reconciliation_results.column_comparison, - reconciliation_results.data_matches, - reconciliation_results.source_row_count, - reconciliation_results.target_row_count, - reconciliation_results.source_missing_count, - 
reconciliation_results.target_missing_count, - reconciliation_results.error_message as reconciliation_error, - tables.view_text, - tables.storage_properties, - tables.is_partitioned -FROM - $inventory.tables AS tables - LEFT JOIN - $inventory.migration_status AS migration_status - ON tables.`database` = migration_status.src_schema AND tables.name = migration_status.src_table - LEFT JOIN - $inventory.reconciliation_results AS reconciliation_results - ON tables.`database` = reconciliation_results.src_schema AND tables.name = reconciliation_results.src_table \ No newline at end of file diff --git a/src/databricks/labs/ucx/queries/views/reconciliation_results.sql b/src/databricks/labs/ucx/queries/views/reconciliation_results.sql deleted file mode 100644 index ef60632235..0000000000 --- a/src/databricks/labs/ucx/queries/views/reconciliation_results.sql +++ /dev/null @@ -1,30 +0,0 @@ -WITH flattened AS ( - SELECT - *, - from_json( - schema_comparison, - 'STRUCT>, is_matching: BOOLEAN>' - ) AS schema_comparison_result, - from_json( - data_comparison, - 'STRUCT' - ) AS data_comparison_result - FROM - $inventory.recon_results -) -SELECT - src_schema, - src_table, - dst_catalog, - dst_schema, - dst_table, - schema_matches, - data_matches, - data_comparison_result.source_row_count, - data_comparison_result.target_row_count, - data_comparison_result.source_missing_count, - data_comparison_result.target_missing_count, - schema_comparison_result.data AS column_comparison, - error_message -FROM - flattened \ No newline at end of file diff --git a/src/databricks/labs/ucx/recon/base.py b/src/databricks/labs/ucx/recon/base.py index 9471c5a84d..1e4de9ecd6 100644 --- a/src/databricks/labs/ucx/recon/base.py +++ b/src/databricks/labs/ucx/recon/base.py @@ -1,4 +1,3 @@ -import dataclasses from abc import ABC, abstractmethod from dataclasses import dataclass @@ -65,19 +64,13 @@ class SchemaComparisonResult: is_matching: bool data: list[SchemaComparisonEntry] - def as_dict(self): - return dataclasses.asdict(self) - @dataclass class DataComparisonResult: source_row_count: int target_row_count: int - target_missing_count: int = 0 - source_missing_count: int = 0 - - def as_dict(self): - return dataclasses.asdict(self) + num_missing_records_in_target: int + num_missing_records_in_source: int class TableMetadataRetriever(ABC): @@ -106,12 +99,7 @@ def compare_schema(self, source: TableIdentifier, target: TableIdentifier) -> Sc class DataComparator(ABC): @abstractmethod - def compare_data( - self, - source: TableIdentifier, - target: TableIdentifier, - compare_rows: bool, - ) -> DataComparisonResult: + def compare_data(self, source: TableIdentifier, target: TableIdentifier) -> DataComparisonResult: """ Compare data for two tables """ diff --git a/src/databricks/labs/ucx/recon/data_comparator.py b/src/databricks/labs/ucx/recon/data_comparator.py index 64c0896460..98a91d3b0f 100644 --- a/src/databricks/labs/ucx/recon/data_comparator.py +++ b/src/databricks/labs/ucx/recon/data_comparator.py @@ -25,11 +25,11 @@ class StandardDataComparator(DataComparator): CASE WHEN target.hash_value IS NULL THEN 1 ELSE 0 - END AS target_missing_count, + END AS num_missing_records_in_target, CASE WHEN source.hash_value IS NULL THEN 1 ELSE 0 - END AS source_missing_count + END AS num_missing_records_in_source FROM ( SELECT {source_hash_expr} AS hash_value FROM {source_table_fqn} @@ -42,8 +42,8 @@ class StandardDataComparator(DataComparator): ) SELECT COUNT(*) AS total_mismatches, - COALESCE(SUM(target_missing_count), 0) AS 
target_missing_count, - COALESCE(SUM(source_missing_count), 0) AS source_missing_count + COALESCE(SUM(num_missing_records_in_target), 0) AS num_missing_records_in_target, + COALESCE(SUM(num_missing_records_in_source), 0) AS num_missing_records_in_source FROM compare_results WHERE is_match IS FALSE; """ @@ -52,12 +52,7 @@ def __init__(self, sql_backend: SqlBackend, data_profiler: DataProfiler): self._sql_backend = sql_backend self._data_profiler = data_profiler - def compare_data( - self, - source: TableIdentifier, - target: TableIdentifier, - compare_rows: bool = False, - ) -> DataComparisonResult: + def compare_data(self, source: TableIdentifier, target: TableIdentifier) -> DataComparisonResult: """ This method compares the data of two tables. It takes two TableIdentifier objects as input, which represent the source and target tables for which the data are to be compared. @@ -68,24 +63,19 @@ def compare_data( """ source_data_profile = self._data_profiler.profile_data(source) target_data_profile = self._data_profiler.profile_data(target) - if not compare_rows: - return DataComparisonResult( - source_row_count=source_data_profile.row_count, - target_row_count=target_data_profile.row_count, - ) comparison_query = StandardDataComparator.build_data_comparison_query( source_data_profile, target_data_profile, ) query_result: Iterator[Row] = self._sql_backend.fetch(comparison_query) count_row = next(query_result) - target_missing_count = int(count_row["target_missing_count"]) - source_missing_count = int(count_row["source_missing_count"]) + num_missing_records_in_target = int(count_row["num_missing_records_in_target"]) + num_missing_records_in_source = int(count_row["num_missing_records_in_source"]) return DataComparisonResult( source_row_count=source_data_profile.row_count, target_row_count=target_data_profile.row_count, - target_missing_count=target_missing_count, - source_missing_count=source_missing_count, + num_missing_records_in_target=num_missing_records_in_target, + num_missing_records_in_source=num_missing_records_in_source, ) @classmethod diff --git a/src/databricks/labs/ucx/recon/migration_recon.py b/src/databricks/labs/ucx/recon/migration_recon.py deleted file mode 100644 index 164a2c4ddc..0000000000 --- a/src/databricks/labs/ucx/recon/migration_recon.py +++ /dev/null @@ -1,149 +0,0 @@ -import json -import logging -from collections.abc import Iterable -from dataclasses import dataclass -from functools import partial - -from databricks.sdk.errors import DatabricksError -from databricks.labs.blueprint.parallel import Threads -from databricks.labs.lsql.backends import SqlBackend -from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.hive_metastore.mapping import TableMapping -from databricks.labs.ucx.hive_metastore.migration_status import MigrationStatusRefresher -from databricks.labs.ucx.recon.base import ( - DataComparator, - SchemaComparator, - TableIdentifier, - DataComparisonResult, -) - -logger = logging.getLogger(__name__) - - -@dataclass -class ReconResult: - src_schema: str - src_table: str - dst_catalog: str - dst_schema: str - dst_table: str - schema_matches: bool = False - data_matches: bool = False - schema_comparison: str | None = None - data_comparison: str | None = None - error_message: str | None = None - - -class MigrationRecon(CrawlerBase[ReconResult]): - def __init__( - self, - sbe: SqlBackend, - schema: str, - migration_status_refresher: MigrationStatusRefresher, - table_mapping: TableMapping, - schema_comparator: SchemaComparator, 
- data_comparator: DataComparator, - default_threshold: float, - ): - super().__init__(sbe, "hive_metastore", schema, "recon_results", ReconResult) - self._migration_status_refresher = migration_status_refresher - self._table_mapping = table_mapping - self._schema_comparator = schema_comparator - self._data_comparator = data_comparator - self._default_threshold = default_threshold - - def snapshot(self) -> Iterable[ReconResult]: - return self._snapshot(self._try_fetch, self._crawl) - - def _crawl(self) -> Iterable[ReconResult]: - self._migration_status_refresher.reset() - migration_index = self._migration_status_refresher.index() - migration_rules = self._table_mapping.load() - tasks = [] - for source in migration_index.snapshot(): - migration_status = migration_index.get(*source) - if migration_status is None: - continue - if not migration_status.dst_catalog or not migration_status.dst_schema or not migration_status.dst_table: - continue - source_table = TableIdentifier( - "hive_metastore", - migration_status.src_schema, - migration_status.src_table, - ) - target_table = TableIdentifier( - migration_status.dst_catalog, - migration_status.dst_schema, - migration_status.dst_table, - ) - compare_rows = False - recon_tolerance_percent = self._default_threshold - for rule in migration_rules: - if rule.match(source_table): - compare_rows = rule.compare_rows - recon_tolerance_percent = rule.recon_tolerance_percent - break - tasks.append( - partial( - self._recon, - source_table, - target_table, - compare_rows, - recon_tolerance_percent, - ) - ) - recon_results, errors = Threads.gather("reconciling data", tasks) - if len(errors) > 0: - logger.error(f"Detected {len(errors)} while reconciling data") - return recon_results - - def _recon( - self, - source: TableIdentifier, - target: TableIdentifier, - compare_rows: bool, - recon_tolerance_percent: int, - ) -> ReconResult | None: - try: - schema_comparison = self._schema_comparator.compare_schema(source, target) - data_comparison = self._data_comparator.compare_data(source, target, compare_rows) - except DatabricksError as e: - logger.warning(f"Error while comparing {source.fqn_escaped} and {target.fqn_escaped}: {e}") - return ReconResult( - source.schema, - source.table, - target.catalog, - target.schema, - target.table, - error_message=str(e), - ) - recon_results = ReconResult( - source.schema, - source.table, - target.catalog, - target.schema, - target.table, - schema_comparison.is_matching, - ( - self._percentage_difference(data_comparison) <= recon_tolerance_percent - and data_comparison.target_missing_count == 0 - and data_comparison.source_missing_count == 0 - ), - json.dumps(schema_comparison.as_dict()), - json.dumps(data_comparison.as_dict()), - ) - return recon_results - - @staticmethod - def _percentage_difference(result: DataComparisonResult) -> int: - source_row_count = result.source_row_count - target_row_count = result.target_row_count - if source_row_count == 0 and target_row_count == 0: - return 0 - if source_row_count == 0 and target_row_count > 0: - return 100 - return round(abs(source_row_count - target_row_count) / source_row_count * 100) - - def _try_fetch(self) -> Iterable[ReconResult]: - for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"): - yield ReconResult(*row) diff --git a/src/databricks/labs/ucx/recon/workflows.py b/src/databricks/labs/ucx/recon/workflows.py deleted file mode 100644 index 8d8c7e0187..0000000000 --- a/src/databricks/labs/ucx/recon/workflows.py +++ /dev/null @@ -1,21 +0,0 @@ -from 
databricks.labs.ucx.contexts.workflow_task import RuntimeContext -from databricks.labs.ucx.framework.tasks import Workflow, job_task - - -class MigrationRecon(Workflow): - def __init__(self): - super().__init__('migrate-data-reconciliation') - - @job_task(job_cluster="table_migration") - def recon_migration_result(self, ctx: RuntimeContext): - """This workflow validate post-migration datasets against their pre-migration counterparts. This includes all - tables, by comparing their schema, row counts and row comparison - """ - # need to delete the existing content of recon results table, so that snapshot will re-populate it - ctx.migration_recon.reset() - ctx.migration_recon.snapshot() - - @job_task(depends_on=[recon_migration_result], dashboard="migration_main") - def reconciliation_report(self, ctx: RuntimeContext): - """Refreshes the migration dashboard after all previous tasks have been completed. Note that you can access the - dashboard _before_ all tasks have been completed, but then only already completed information is shown.""" diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index 8c015630d2..3e232ceeab 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -10,13 +10,11 @@ from databricks.labs.ucx.framework.tasks import Task, Workflow, parse_args from databricks.labs.ucx.installer.logs import TaskLogger from databricks.labs.ucx.hive_metastore.workflows import ( - ScanTablesInMounts, - TableMigration, MigrateTablesInMounts, + TableMigration, MigrateHiveSerdeTablesInPlace, MigrateExternalTablesCTAS, ) -from databricks.labs.ucx.recon.workflows import MigrationRecon from databricks.labs.ucx.source_code.workflows import ExperimentalWorkflowLinter from databricks.labs.ucx.workspace_access.workflows import ( GroupMigration, @@ -51,11 +49,9 @@ def all(cls): MigrateExternalTablesCTAS(), ValidateGroupPermissions(), RemoveWorkspaceLocalGroups(), - ScanTablesInMounts(), MigrateTablesInMounts(), PermissionsMigrationAPI(), ExperimentalWorkflowLinter(), - MigrationRecon(), Failing(), ] ) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 7ca4c51c82..83f3326214 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -3,8 +3,6 @@ from abc import abstractmethod from collections.abc import Iterable from dataclasses import dataclass -from pathlib import Path - # Code mapping between LSP, PyLint, and our own diagnostics: # | LSP | PyLint | Our | @@ -56,15 +54,6 @@ def as_deprecation(self) -> 'Deprecation': def as_convention(self) -> 'Convention': return Convention(**self.__dict__) - def for_path(self, path: Path) -> LocatedAdvice: - return LocatedAdvice(self, path) - - -@dataclass -class LocatedAdvice: - advice: Advice - path: Path - class Advisory(Advice): """A warning that does not prevent the code from running.""" diff --git a/src/databricks/labs/ucx/source_code/files.py b/src/databricks/labs/ucx/source_code/files.py index 3c346a6895..763f9a94ce 100644 --- a/src/databricks/labs/ucx/source_code/files.py +++ b/src/databricks/labs/ucx/source_code/files.py @@ -1,17 +1,10 @@ from __future__ import annotations # for type hints import logging -from collections.abc import Iterable, Callable from pathlib import Path -import sys -from typing import TextIO -from databricks.labs.ucx.source_code.base import LocatedAdvice -from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, SUPPORTED_EXTENSION_LANGUAGES from 
databricks.labs.ucx.source_code.path_lookup import PathLookup -from databricks.labs.ucx.source_code.whitelist import Whitelist, UCCompatibility from databricks.sdk.service.workspace import Language -from databricks.labs.blueprint.tui import Prompts from databricks.labs.ucx.source_code.languages import Languages from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage @@ -24,7 +17,6 @@ DependencyProblem, MaybeDependency, SourceContainer, - DependencyResolver, ) logger = logging.getLogger(__name__) @@ -39,13 +31,10 @@ def __init__(self, path: Path, source: str, language: Language): self._language = CellLanguage.of_language(language) def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: - if self._language is CellLanguage.PYTHON: - return parent.build_graph_from_python_source(self._original_code) - # supported language that does not generate dependencies - if self._language is CellLanguage.SQL: + if self._language is not CellLanguage.PYTHON: + logger.warning(f"Unsupported language: {self._language.language}") return [] - logger.warning(f"Unsupported language: {self._language.language}") - return [] + return parent.build_graph_from_python_source(self._original_code) @property def path(self) -> Path: @@ -55,95 +44,12 @@ def __repr__(self): return f"" -class Folder(SourceContainer): - def __init__(self, path: Path, file_loader: FileLoader, folder_loader: FolderLoader): - self._path = path - self._file_loader = file_loader - self._folder_loader = folder_loader - - @property - def path(self) -> Path: - return self._path - - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: - # don't directly scan non-source directories, let it be done for relevant imports only - if self._path.name in {"__pycache__", ".git", ".github", ".venv", ".mypy_cache", "site-packages"}: - return [] - return list(self._build_dependency_graph(parent)) - - def _build_dependency_graph(self, parent: DependencyGraph) -> Iterable[DependencyProblem]: - for child_path in self._path.iterdir(): - loader = self._folder_loader if child_path.is_dir() else self._file_loader - dependency = Dependency(loader, child_path) - yield from parent.register_dependency(dependency).problems - - def __repr__(self): - return f"" - - -class LocalCodeLinter: - - def __init__( - self, - file_loader: FileLoader, - folder_loader: FolderLoader, - path_lookup: PathLookup, - dependency_resolver: DependencyResolver, - languages_factory: Callable[[], Languages], - ) -> None: - self._file_loader = file_loader - self._folder_loader = folder_loader - self._path_lookup = path_lookup - self._dependency_resolver = dependency_resolver - self._extensions = {".py": Language.PYTHON, ".sql": Language.SQL} - self._languages_factory = languages_factory - - def lint(self, prompts: Prompts, path: Path | None, stdout: TextIO = sys.stdout) -> list[LocatedAdvice]: - """Lint local code files looking for problems in notebooks and python files.""" - if path is None: - response = prompts.question( - "Which file or directory do you want to lint?", - default=Path.cwd().as_posix(), - validate=lambda p_: Path(p_).exists(), - ) - path = Path(response) - located_advices = list(self._lint(path)) - for located in located_advices: - advice_path = located.path.relative_to(path) - advice = located.advice - message = ( - f"{advice_path.as_posix()}:{advice.start_line}:{advice.start_col}: [{advice.code}] {advice.message}\n" - ) - stdout.write(message) - return located_advices - - def _lint(self, path: Path) -> 
Iterable[LocatedAdvice]: - loader = self._folder_loader if path.is_dir() else self._file_loader - dependency = Dependency(loader, path) - graph = DependencyGraph(dependency, None, self._dependency_resolver, self._path_lookup) - container = dependency.load(self._path_lookup) - assert container is not None # because we just created it - problems = container.build_dependency_graph(graph) - for problem in problems: - problem_path = Path('UNKNOWN') if problem.is_path_missing() else problem.source_path.absolute() - yield problem.as_advisory().for_path(problem_path) - for child_path in graph.all_paths: - yield from self._lint_one(child_path) - - def _lint_one(self, path: Path) -> Iterable[LocatedAdvice]: - if path.is_dir(): - return [] - languages = self._languages_factory() - linter = FileLinter(languages, path) - return [advice.for_path(path) for advice in linter.lint()] - - class LocalFileMigrator: """The LocalFileMigrator class is responsible for fixing code files based on their language.""" - def __init__(self, languages_factory: Callable[[], Languages]): + def __init__(self, languages: Languages): + self._languages = languages self._extensions = {".py": Language.PYTHON, ".sql": Language.SQL} - self._languages_factory = languages_factory def apply(self, path: Path) -> bool: if path.is_dir(): @@ -166,8 +72,7 @@ def _apply_file_fix(self, path): return False logger.info(f"Analysing {path}") # Get the linter for the language - languages = self._languages_factory() - linter = languages.linter(language) + linter = self._languages.linter(language) # Open the file and read the code with path.open("r") as f: code = f.read() @@ -175,7 +80,7 @@ def _apply_file_fix(self, path): # Lint the code and apply fixes for advice in linter.lint(code): logger.info(f"Found: {advice}") - fixer = languages.fixer(language, advice.code) + fixer = self._languages.fixer(language, advice.code) if not fixer: continue logger.info(f"Applying fix for {advice}") @@ -190,31 +95,12 @@ def _apply_file_fix(self, path): return True -class StubContainer(SourceContainer): - - def __init__(self, path: Path): - super().__init__() - self._path = path - - def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: - return [] - - class FileLoader(DependencyLoader): def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None: absolute_path = path_lookup.resolve(dependency.path) if not absolute_path: return None - language = SUPPORTED_EXTENSION_LANGUAGES.get(absolute_path.suffix.lower()) - if not language: - return StubContainer(absolute_path) - for encoding in ("utf-8", "ascii"): - try: - code = absolute_path.read_text(encoding) - return LocalFile(absolute_path, code, language) - except UnicodeDecodeError: - pass - return None + return LocalFile(absolute_path, absolute_path.read_text("utf-8"), Language.PYTHON) def exists(self, path: Path) -> bool: return path.exists() @@ -223,24 +109,14 @@ def __repr__(self): return "FileLoader()" -class FolderLoader(FileLoader): +class LocalFileResolver(BaseImportResolver, BaseFileResolver): - def __init__(self, file_loader: FileLoader): + def __init__(self, file_loader: FileLoader, next_resolver: BaseImportResolver | None = None): + super().__init__(next_resolver) self._file_loader = file_loader - def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None: - absolute_path = path_lookup.resolve(dependency.path) - if not absolute_path: - return None - return Folder(absolute_path, self._file_loader, 
self) - - -class ImportFileResolver(BaseImportResolver, BaseFileResolver): - - def __init__(self, file_loader: FileLoader, whitelist: Whitelist): - super().__init__() - self._whitelist = whitelist - self._file_loader = file_loader + def with_next_resolver(self, resolver: BaseImportResolver) -> BaseImportResolver: + return LocalFileResolver(self._file_loader, resolver) def resolve_local_file(self, path_lookup, path: Path) -> MaybeDependency: absolute_path = path_lookup.resolve(path) @@ -250,29 +126,6 @@ def resolve_local_file(self, path_lookup, path: Path) -> MaybeDependency: return MaybeDependency(None, [problem]) def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: - maybe = self._resolve_whitelist(name) - if maybe is not None: - return maybe - maybe = self._resolve_import(path_lookup, name) - if maybe is not None: - return maybe - return self._fail('import-not-found', f"Could not locate import: {name}") - - def _resolve_whitelist(self, name: str) -> MaybeDependency | None: - # TODO attach compatibility to dependency, see https://github.com/databrickslabs/ucx/issues/1382 - compatibility = self._whitelist.compatibility(name) - if compatibility == UCCompatibility.FULL: - return MaybeDependency(None, []) - if compatibility == UCCompatibility.NONE: - # TODO move to linter, see https://github.com/databrickslabs/ucx/issues/1527 - problem = DependencyProblem("dependency-check", f"Use of dependency {name} is deprecated") - return MaybeDependency(None, [problem]) - if compatibility == UCCompatibility.PARTIAL: - problem = DependencyProblem("dependency-check", f"Package {name} is only partially supported by UC") - return MaybeDependency(None, [problem]) - return None - - def _resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency | None: if not name: return MaybeDependency(None, [DependencyProblem("ucx-bug", "Import name is empty")]) parts = [] @@ -295,11 +148,7 @@ def _resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency continue dependency = Dependency(self._file_loader, absolute_path) return MaybeDependency(dependency, []) - return None - - @staticmethod - def _fail(code: str, message: str): - return MaybeDependency(None, [DependencyProblem(code, message)]) + return super().resolve_import(path_lookup, name) def __repr__(self): - return "ImportFileResolver()" + return "LocalFileResolver()" diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 9d19909186..f061329f51 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -5,7 +5,6 @@ from dataclasses import dataclass from pathlib import Path from collections.abc import Callable -from typing import cast from databricks.labs.ucx.source_code.base import Advisory from databricks.labs.ucx.source_code.python_linter import ( @@ -162,8 +161,7 @@ def build_graph_from_python_source(self, python_code: str) -> list[DependencyPro linter = ASTLinter.parse(python_code) syspath_changes = PythonLinter.list_sys_path_changes(linter) run_calls = PythonLinter.list_dbutils_notebook_run_calls(linter) - import_sources, import_problems = PythonLinter.list_import_sources(linter, DependencyProblem) - problems.extend(cast(list[DependencyProblem], import_problems)) + import_sources = PythonLinter.list_import_sources(linter) nodes = syspath_changes + run_calls + import_sources # need to execute things in intertwined sequence so concat and sort for base_node in sorted(nodes, key=lambda node: 
node.node.lineno * 10000 + node.node.col_offset): @@ -279,9 +277,21 @@ def _fail(code: str, message: str) -> MaybeDependency: class BaseImportResolver(abc.ABC): + def __init__(self, next_resolver: BaseImportResolver | None): + self._next_resolver = next_resolver + @abc.abstractmethod + def with_next_resolver(self, resolver: BaseImportResolver) -> BaseImportResolver: + """required to create a linked list of resolvers""" + + @property + def next_resolver(self): + return self._next_resolver + def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: - """resolve a simple or composite import name""" + # TODO: remove StubResolver and return MaybeDependency(None, [...]) + assert self._next_resolver is not None + return self._next_resolver.resolve_import(path_lookup, name) class BaseFileResolver(abc.ABC): @@ -307,6 +317,22 @@ def _fail(code: str, message: str): return MaybeDependency(None, [DependencyProblem(code, message)]) +class StubImportResolver(BaseImportResolver): + + def __init__(self): + super().__init__(None) + + def with_next_resolver(self, resolver: BaseImportResolver) -> BaseImportResolver: + raise NotImplementedError("Should never happen!") + + def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: + return self._fail('import-not-found', f"Could not locate import: {name}") + + @staticmethod + def _fail(code: str, message: str): + return MaybeDependency(None, [DependencyProblem(code, message)]) + + @dataclass class MaybeDependency: dependency: Dependency | None @@ -325,12 +351,12 @@ def __init__( self, library_resolvers: list[BaseLibraryResolver], notebook_resolver: BaseNotebookResolver, - import_resolver: BaseImportResolver, + import_resolvers: list[BaseImportResolver], path_lookup: PathLookup, ): self._library_resolver = self._chain_library_resolvers(library_resolvers) self._notebook_resolver = notebook_resolver - self._import_resolver = import_resolver + self._import_resolver = self._chain_import_resolvers(import_resolvers) self._path_lookup = path_lookup @staticmethod @@ -341,6 +367,14 @@ def _chain_library_resolvers(library_resolvers: list[BaseLibraryResolver]) -> Ba previous = resolver return previous + @staticmethod + def _chain_import_resolvers(import_resolvers: list[BaseImportResolver]) -> BaseImportResolver: + previous: BaseImportResolver = StubImportResolver() + for resolver in import_resolvers: + resolver = resolver.with_next_resolver(previous) + previous = resolver + return previous + def resolve_notebook(self, path_lookup: PathLookup, path: Path) -> MaybeDependency: return self._notebook_resolver.resolve_notebook(path_lookup, path) @@ -372,8 +406,11 @@ def build_local_file_dependency_graph(self, path: Path) -> MaybeGraph: @property def _local_file_resolver(self) -> BaseFileResolver | None: - if isinstance(self._import_resolver, BaseFileResolver): - return self._import_resolver + resolver = self._import_resolver + while resolver is not None: + if isinstance(resolver, BaseFileResolver): + return resolver + resolver = resolver.next_resolver return None def build_notebook_dependency_graph(self, path: Path) -> MaybeGraph: diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 57a7acdf9a..385b15a180 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -9,7 +9,6 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service import compute, jobs -from 
databricks.sdk.service.workspace import ExportFormat from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.mixins.wspath import WorkspacePath @@ -43,10 +42,6 @@ class JobProblem: end_line: int end_col: int - def as_message(self) -> str: - message = f"{self.path}:{self.start_line} [{self.code}] {self.message}" - return message - class WorkflowTask(Dependency): def __init__(self, ws: WorkspaceClient, task: jobs.Task, job: jobs.Job): @@ -87,7 +82,8 @@ def _register_libraries(self, graph: DependencyGraph) -> Iterable[DependencyProb for library in self._task.libraries: yield from self._register_library(graph, library) - def _register_library(self, graph: DependencyGraph, library: compute.Library) -> Iterable[DependencyProblem]: + @staticmethod + def _register_library(graph: DependencyGraph, library: compute.Library) -> Iterable[DependencyProblem]: if library.pypi: problems = graph.register_library(library.pypi.package) if problems: @@ -99,19 +95,10 @@ def _register_library(self, graph: DependencyGraph, library: compute.Library) -> # TODO: download the wheel somewhere local and add it to "virtual sys.path" via graph.path_lookup.push_path # TODO: https://github.com/databrickslabs/ucx/issues/1640 yield DependencyProblem("not-yet-implemented", "Wheel library is not yet implemented") - if library.requirements: # https://pip.pypa.io/en/stable/reference/requirements-file-format/ - logger.info(f"Registering libraries from {library.requirements}") - with self._ws.workspace.download(library.requirements, format=ExportFormat.AUTO) as remote_file: - contents = remote_file.read().decode() - for requirement in contents.splitlines(): - clean_requirement = requirement.replace(" ", "") # requirements.txt may contain spaces - if clean_requirement.startswith("-r"): - logger.warning(f"References to other requirements file is not supported: {requirement}") - continue - if clean_requirement.startswith("-c"): - logger.warning(f"References to constrains file is not supported: {requirement}") - continue - yield from graph.register_library(clean_requirement) + if library.requirements: + # TODO: download and add every entry via graph.register_library + # TODO: https://github.com/databrickslabs/ucx/issues/1644 + yield DependencyProblem('not-yet-implemented', 'Requirements library is not yet implemented') if library.jar: # TODO: https://github.com/databrickslabs/ucx/issues/1641 yield DependencyProblem('not-yet-implemented', 'Jar library is not yet implemented') @@ -120,7 +107,7 @@ def _register_notebook(self, graph: DependencyGraph) -> Iterable[DependencyProbl if not self._task.notebook_task: return [] notebook_path = self._task.notebook_task.notebook_path - logger.info(f'Discovering {self._task.task_key} entrypoint: {notebook_path}') + logger.info(f'Disovering {self._task.task_key} entrypoint: {notebook_path}') path = WorkspacePath(self._ws, notebook_path) return graph.register_notebook(path) @@ -192,16 +179,11 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): def lint_job(self, job_id: int) -> list[JobProblem]: try: job = self._ws.jobs.get(job_id) + return list(self._lint_job(job)) except NotFound: logger.warning(f'Could not find job: {job_id}') return [] - problems = self._lint_job(job) - if len(problems) > 0: - problem_messages = "\n".join([problem.as_message() for problem in problems]) - logger.warning(f"Found job problems:\n{problem_messages}") - return problems - _UNKNOWN = Path('') def _lint_job(self, job: jobs.Job) -> list[JobProblem]: 
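For context on the jobs.py hunk above: the removed branch parsed a task's requirements.txt, stripped spaces, and skipped -r/-c references before registering each entry on the dependency graph, while the restored branch only emits a "not-yet-implemented" problem. Below is a minimal, standalone sketch of that filtering step; it is illustrative only, under the assumption of a free-standing helper — clean_requirements and the sample inputs are hypothetical and not part of the codebase or its API.

from collections.abc import Iterable


def clean_requirements(lines: Iterable[str]) -> list[str]:
    """Return plain requirement specifiers, dropping blanks and -r/-c file references."""
    cleaned: list[str] = []
    for line in lines:
        requirement = line.replace(" ", "")  # requirements.txt may contain spaces
        if not requirement:
            continue
        if requirement.startswith("-r") or requirement.startswith("-c"):
            # references to other requirements/constraints files are not followed
            continue
        cleaned.append(requirement)
    return cleaned


if __name__ == "__main__":
    sample = ["pyspark == 3.5.0", "-r other-requirements.txt", "-c constraints.txt", ""]
    assert clean_requirements(sample) == ["pyspark==3.5.0"]

Each surviving entry would then be handed to the graph (graph.register_library in the hunk above), which is why only bare specifiers are kept.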
diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index a5c4665fef..47a8b75759 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -1,7 +1,6 @@ from __future__ import annotations import logging -import re from abc import ABC, abstractmethod from ast import parse as parse_python from enum import Enum @@ -185,7 +184,7 @@ def is_runnable(self) -> bool: def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: # TODO: this is very basic code, we need to improve it - splits = re.split(r" |\n", self.original_code) + splits = self.original_code.split(' ') if len(splits) < 3: return [DependencyProblem("library-install-failed", f"Missing arguments in '{self.original_code}'")] if splits[1] != "install": diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index a807f9d5e0..941bfbe449 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -104,73 +104,29 @@ def name() -> str: return "notebook-linter" -SUPPORTED_EXTENSION_LANGUAGES = { - '.py': Language.PYTHON, - '.sql': Language.SQL, -} - - class FileLinter: - _NOT_YET_SUPPORTED_SUFFIXES = { - '.scala', - '.sh', - '.r', - } - _IGNORED_SUFFIXES = { - '.json', - '.md', - '.txt', - '.xml', - '.yml', - '.toml', - '.cfg', - '.bmp', - '.gif', - '.png', - '.tif', - '.tiff', - '.svg', - '.jpg', - '.jpeg', - '.pyc', - '.whl', - '.egg', - '.class', - '.iml', - '.gz', - } - _IGNORED_NAMES = { - '.ds_store', - '.gitignore', - '.coverage', - 'license', - 'codeowners', - 'makefile', - 'pkg-info', - 'metadata', - 'wheel', - 'record', - 'notice', - 'zip-safe', + _EXT = { + '.py': Language.PYTHON, + '.sql': Language.SQL, } - def __init__(self, langs: Languages, path: Path, content: str | None = None): + def __init__(self, langs: Languages, path: Path): self._languages: Languages = langs self._path: Path = path - self._content = content @cached_property - def _source_code(self) -> str: - return self._path.read_text() if self._content is None else self._content + def _content(self) -> str: + return self._path.read_text() def _file_language(self): - return SUPPORTED_EXTENSION_LANGUAGES.get(self._path.suffix.lower()) + return self._EXT.get(self._path.suffix) def _is_notebook(self): language = self._file_language() if not language: return False - return self._source_code.startswith(CellLanguage.of_language(language).file_magic_header) + cell_language = CellLanguage.of_language(language) + return self._content.startswith(cell_language.file_magic_header) def lint(self) -> Iterable[Advice]: if self._is_notebook(): @@ -181,23 +137,14 @@ def lint(self) -> Iterable[Advice]: def _lint_file(self): language = self._file_language() if not language: - suffix = self._path.suffix.lower() - if suffix in self._IGNORED_SUFFIXES or self._path.name.lower() in self._IGNORED_NAMES: - yield from [] - elif suffix in self._NOT_YET_SUPPORTED_SUFFIXES: - yield Failure("unsupported-language", f"Language not supported yet for {self._path}", 0, 0, 1, 1) - else: - yield Failure("unknown-language", f"Cannot detect language for {self._path}", 0, 0, 1, 1) - else: - try: - linter = self._languages.linter(language) - yield from linter.lint(self._source_code) - except ValueError as err: - yield Failure( - "unsupported-content", f"Error while parsing content of 
{self._path.as_posix()}: {err}", 0, 0, 1, 1 - ) + yield Failure("unsupported-language", f"Cannot detect language for {self._path}", 0, 0, 1, 1) + try: + linter = self._languages.linter(language) + yield from linter.lint(self._content) + except ValueError as err: + yield Failure("unsupported-language", str(err), 0, 0, 1, 1) def _lint_notebook(self): - notebook = Notebook.parse(self._path, self._source_code, self._file_language()) + notebook = Notebook.parse(self._path, self._content, self._file_language()) notebook_linter = NotebookLinter(self._languages, notebook) yield from notebook_linter.lint() diff --git a/src/databricks/labs/ucx/source_code/pyspark.py b/src/databricks/labs/ucx/source_code/pyspark.py index 76b40e6fd8..5707b87ae7 100644 --- a/src/databricks/labs/ucx/source_code/pyspark.py +++ b/src/databricks/labs/ucx/source_code/pyspark.py @@ -42,7 +42,8 @@ def apply(self, from_table: FromTable, index: MigrationIndex, node: ast.Call) -> def _get_table_arg(self, node: ast.Call): if len(node.args) > 0: return node.args[self.table_arg_index] if self.min_args <= len(node.args) <= self.max_args else None - if not self.table_arg_name or not node.keywords: + assert self.table_arg_name is not None + if not node.keywords: return None arg = next(kw for kw in node.keywords if kw.arg == self.table_arg_name) return arg.value if arg is not None else None diff --git a/src/databricks/labs/ucx/source_code/python_libraries.py b/src/databricks/labs/ucx/source_code/python_libraries.py index d56eb30dd5..230daad9ed 100644 --- a/src/databricks/labs/ucx/source_code/python_libraries.py +++ b/src/databricks/labs/ucx/source_code/python_libraries.py @@ -57,15 +57,13 @@ def _locate_library(self, path_lookup: PathLookup, library: Path) -> Path | None return None def _locate_dist_info(self, library_path: Path, library: Path) -> Path | None: - if not library_path.parent.exists(): - return None - dist_info_dir = None - for package in library_path.parent.iterdir(): - if package.name.startswith(library.name) and package.name.endswith(".dist-info"): - dist_info_dir = package - break # TODO: Matching multiple .dist-info's, https://github.com/databrickslabs/ucx/issues/1751 + packages = os.listdir(library_path.parent.as_posix()) + dist_info_dir = next( + (package for package in packages if package.startswith(library.name) and package.endswith(".dist-info")), + None, + ) if dist_info_dir is not None: - return dist_info_dir + return Path(library_path.parent, Path(dist_info_dir)) # maybe the name needs tweaking if "-" in library.name: name = library.name.replace("-", "_") diff --git a/src/databricks/labs/ucx/source_code/python_linter.py b/src/databricks/labs/ucx/source_code/python_linter.py index c7ce6a1c6f..50e73d3988 100644 --- a/src/databricks/labs/ucx/source_code/python_linter.py +++ b/src/databricks/labs/ucx/source_code/python_linter.py @@ -3,7 +3,7 @@ import abc import ast import logging -from collections.abc import Iterable, Callable +from collections.abc import Iterable from typing import TypeVar, Generic, cast from databricks.labs.ucx.source_code.base import Linter, Advice, Advisory @@ -270,9 +270,6 @@ def get_constant_path(self) -> str | None: return None -P = TypeVar("P", bound=Callable) - - class PythonLinter(Linter): def lint(self, code: str) -> Iterable[Advice]: @@ -315,33 +312,21 @@ def list_dbutils_notebook_run_calls(linter: ASTLinter) -> list[NotebookRunCall]: return [NotebookRunCall(call) for call in calls] @staticmethod - def list_import_sources(linter: ASTLinter, problem_type: P) -> 
tuple[list[ImportSource], list[P]]: - problems: list[P] = [] + def list_import_sources(linter: ASTLinter) -> list[ImportSource]: + # TODO: make this code more robust, because it fails detecting imports on UCX codebase try: # pylint: disable=too-many-try-statements nodes = linter.locate(ast.Import, []) sources = [ImportSource(node, alias.name) for node in nodes for alias in node.names] nodes = linter.locate(ast.ImportFrom, []) sources.extend(ImportSource(node, node.module) for node in nodes) nodes = linter.locate(ast.Call, [("import_module", ast.Attribute), ("importlib", ast.Name)]) - nodes.extend(linter.locate(ast.Call, [("__import__", ast.Attribute), ("importlib", ast.Name)])) - for node in nodes: - if isinstance(node.args[0], ast.Constant): - sources.append(ImportSource(node, node.args[0].value)) - continue - problem = problem_type( - 'dependency-not-constant', - "Can't check dependency not provided as a constant", - start_line=node.lineno, - start_col=node.col_offset, - end_line=node.end_lineno or 0, - end_col=node.end_col_offset or 0, - ) - problems.append(problem) - return sources, problems + sources.extend(ImportSource(node, node.args[0].value) for node in nodes) + nodes = linter.locate(ast.Call, [("__import__", ast.Attribute), ("importlib", ast.Name)]) + sources.extend(ImportSource(node, node.args[0].value) for node in nodes) + return sources except Exception as e: # pylint: disable=broad-except - problem = problem_type('internal-error', f"While linter {linter} was checking imports: {e}") - problems.append(problem) - return [], problems + logger.warning(f"{linter} imports: {e}") + return [] @staticmethod def list_sys_path_changes(linter: ASTLinter) -> list[SysPathChange]: diff --git a/src/databricks/labs/ucx/source_code/whitelist.py b/src/databricks/labs/ucx/source_code/whitelist.py index 1eaa249f62..07c793f4a0 100644 --- a/src/databricks/labs/ucx/source_code/whitelist.py +++ b/src/databricks/labs/ucx/source_code/whitelist.py @@ -9,16 +9,43 @@ from yaml import load_all as load_yaml, Loader +from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.graph import ( + BaseImportResolver, DependencyGraph, DependencyProblem, + MaybeDependency, SourceContainer, ) logger = logging.getLogger(__name__) +class WhitelistResolver(BaseImportResolver): + + def __init__(self, whitelist: Whitelist, next_resolver: BaseImportResolver | None = None): + super().__init__(next_resolver) + self._whitelist = whitelist + + def with_next_resolver(self, resolver: BaseImportResolver) -> BaseImportResolver: + return WhitelistResolver(self._whitelist, resolver) + + def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: + # TODO attach compatibility to dependency, see https://github.com/databrickslabs/ucx/issues/1382 + compatibility = self._whitelist.compatibility(name) + if compatibility == UCCompatibility.FULL: + return MaybeDependency(None, []) + if compatibility == UCCompatibility.NONE: + # TODO move to linter, see https://github.com/databrickslabs/ucx/issues/1527 + problem = DependencyProblem("dependency-check", f"Use of dependency {name} is deprecated") + return MaybeDependency(None, [problem]) + if compatibility == UCCompatibility.PARTIAL: + problem = DependencyProblem("dependency-check", f"Package {name} is only partially supported by UC") + return MaybeDependency(None, [problem]) + return super().resolve_import(path_lookup, name) + + class StubContainer(SourceContainer): def build_dependency_graph(self, parent: DependencyGraph) -> 
list[DependencyProblem]: diff --git a/src/databricks/labs/ucx/source_code/workflows.py b/src/databricks/labs/ucx/source_code/workflows.py index d58f2978aa..c3775dfb1c 100644 --- a/src/databricks/labs/ucx/source_code/workflows.py +++ b/src/databricks/labs/ucx/source_code/workflows.py @@ -11,8 +11,3 @@ def lint_all_workflows(self, ctx: RuntimeContext): """[EXPERIMENTAL] Analyses all jobs for source code compatibility problems. This is an experimental feature, that is not yet fully supported.""" ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) - - @job_task(dashboard="migration_main", depends_on=[lint_all_workflows]) - def migration_report(self, ctx: RuntimeContext): - """Refreshes the migration dashboard after all previous tasks have been completed. Note that you can access the - dashboard _before_ all tasks have been completed, but then only already completed information is shown.""" diff --git a/src/databricks/labs/ucx/workspace_access/clusters.py b/src/databricks/labs/ucx/workspace_access/clusters.py index 0172eb0b97..2289ad6c52 100644 --- a/src/databricks/labs/ucx/workspace_access/clusters.py +++ b/src/databricks/labs/ucx/workspace_access/clusters.py @@ -61,6 +61,7 @@ def map_cluster_to_uc(self, cluster_id: str, cluster_details: list[ClusterDetail aws_attributes=cluster.aws_attributes, ssh_public_keys=cluster.ssh_public_keys, enable_elastic_disk=cluster.enable_elastic_disk, + cluster_source=cluster.cluster_source, instance_pool_id=cluster.instance_pool_id, enable_local_disk_encryption=cluster.enable_local_disk_encryption, driver_instance_pool_id=cluster.driver_instance_pool_id, @@ -104,6 +105,7 @@ def revert_cluster_remap(self, cluster_ids: str, total_cluster_ids: list): aws_attributes=cluster_details.aws_attributes, ssh_public_keys=cluster_details.ssh_public_keys, enable_elastic_disk=cluster_details.enable_elastic_disk, + cluster_source=cluster_details.cluster_source, instance_pool_id=cluster_details.instance_pool_id, enable_local_disk_encryption=cluster_details.enable_local_disk_encryption, driver_instance_pool_id=cluster_details.driver_instance_pool_id, diff --git a/tests/integration/assessment/test_jobs.py b/tests/integration/assessment/test_jobs.py index e5e52998d3..d5ba36507c 100644 --- a/tests/integration/assessment/test_jobs.py +++ b/tests/integration/assessment/test_jobs.py @@ -63,3 +63,10 @@ def test_job_run_crawler(ws, env_or_skip, inventory_schema, sql_backend): failures = job_run.failures continue assert failures and failures == "[]" + + +def test_removeafter_tag(ws, env_or_skip, make_job): + new_job = make_job(spark_conf=_SPARK_CONF) + + created_job = ws.jobs.get(new_job.job_id) + assert "RemoveAfter" in created_job.settings.tags diff --git a/tests/integration/aws/test_access.py b/tests/integration/aws/test_access.py index cd5622d74d..c092b1b186 100644 --- a/tests/integration/aws/test_access.py +++ b/tests/integration/aws/test_access.py @@ -76,10 +76,9 @@ def test_create_uber_instance_profile( ): profile = env_or_skip("AWS_DEFAULT_PROFILE") aws = AWSResources(profile) + account_id = aws.validate_connection().get("Account") sql_backend.save_table( - f"{inventory_schema}.external_locations", - [ExternalLocation("s3://bucket1/FOLDER1", 1)], - ExternalLocation, + f"{inventory_schema}.external_locations", [ExternalLocation("s3://bucket1/FOLDER1", 1)], ExternalLocation ) installation = Installation(ws, make_random(4)) # create a new policy @@ -92,6 +91,7 @@ def test_create_uber_instance_profile( aws, ExternalLocations(ws, sql_backend, inventory_schema), 
aws_cli_ctx.principal_acl, + account_id, ) aws_permissions.create_uber_principal( MockPrompts( diff --git a/tests/integration/azure/test_credentials.py b/tests/integration/azure/test_credentials.py index 36e6c26c0e..317368590a 100644 --- a/tests/integration/azure/test_credentials.py +++ b/tests/integration/azure/test_credentials.py @@ -192,7 +192,8 @@ def test_spn_migration_access_connector_created( # Mocking in an integration test because Azure resource can not be created resource_permissions = create_autospec(AzureResourcePermissions) - access_connector_id = AzureResource(env_or_skip("TEST_ACCESS_CONNECTOR")) + # TODO: Remove the replace after 20-05-2024 + access_connector_id = AzureResource(env_or_skip("TEST_ACCESS_CONNECTOR").replace("-external", "")) access_connector = AccessConnector( id=access_connector_id, name=f"test-{make_random()}", diff --git a/tests/integration/azure/test_locations.py b/tests/integration/azure/test_locations.py index 60d6de47c1..e01871910c 100644 --- a/tests/integration/azure/test_locations.py +++ b/tests/integration/azure/test_locations.py @@ -257,6 +257,7 @@ def test_run_validate_acl(make_cluster_permissions, ws, make_user, make_cluster, ) +@pytest.mark.skip("Waiting for change to use access connector in integration tests") def test_run_external_locations_using_access_connector( clean_storage_credentials, clean_external_locations, @@ -269,7 +270,8 @@ def test_run_external_locations_using_access_connector( # Mocking in an integration test because Azure resource can not be created resource_permissions = create_autospec(AzureResourcePermissions) - access_connector_id = AzureResource(env_or_skip("TEST_ACCESS_CONNECTOR")) + # TODO: Remove the replace after 20-05-2024 + access_connector_id = AzureResource(env_or_skip("TEST_ACCESS_CONNECTOR").replace("-external", "")) mount = env_or_skip("TEST_MOUNT_CONTAINER") storage_account_name = urlparse(mount).hostname.removesuffix(".dfs.core.windows.net") access_connector = AccessConnector( @@ -306,7 +308,7 @@ def test_run_external_locations_using_access_connector( ) az_cli_ctx.service_principal_migration.run(prompts) # Create storage credential for above access connector - az_cli_ctx.external_locations_migration.run() # Create external location using storage credential + az_cli_ctx.azure_external_locations_migration.run() # Create external location using storage credential all_external_locations = az_cli_ctx.workspace_client.external_locations.list() assert len([loc for loc in all_external_locations if loc.url == external_location.location]) > 0 diff --git a/tests/integration/framework/test_fixtures.py b/tests/integration/framework/test_fixtures.py index d9f110cd8a..79ae4dbac7 100644 --- a/tests/integration/framework/test_fixtures.py +++ b/tests/integration/framework/test_fixtures.py @@ -1,5 +1,5 @@ import logging -from datetime import timedelta, datetime + import pytest # pylint: disable-next=import-private-name @@ -10,9 +10,6 @@ from databricks.labs.ucx.mixins.fixtures import * # noqa: F403 logger = logging.getLogger(__name__) -_SPARK_CONF = { - "spark.databricks.cluster.profile": "singleNode", -} @pytest.fixture # type: ignore[no-redef] @@ -105,57 +102,3 @@ def test_table_fixture(make_table): def test_dbfs_fixture(make_mounted_location): logger.info(f"Created new dbfs data copy:{make_mounted_location}") - - -def test_remove_after_tag_jobs(ws, env_or_skip, make_job): - new_job = make_job(spark_conf=_SPARK_CONF) - created_job = ws.jobs.get(new_job.job_id) - assert "RemoveAfter" in created_job.settings.tags - - purge_time = 
datetime.strptime(created_job.settings.tags.get("RemoveAfter"), "%Y%m%d%H") - assert purge_time - datetime.utcnow() < timedelta(hours=1, minutes=15) - - -def test_remove_after_tag_clusters(ws, env_or_skip, make_cluster): - new_cluster = make_cluster(single_node=True, instance_pool_id=env_or_skip('TEST_INSTANCE_POOL_ID')) - created_cluster = ws.clusters.get(new_cluster.cluster_id) - assert "RemoveAfter" in created_cluster.custom_tags - purge_time = datetime.strptime(created_cluster.custom_tags.get("RemoveAfter"), "%Y%m%d%H") - assert purge_time - datetime.utcnow() < timedelta(hours=1, minutes=15) - - -def test_remove_after_tag_warehouse(ws, env_or_skip, make_warehouse): - new_warehouse = make_warehouse() - created_warehouse = ws.warehouses.get(new_warehouse.response.id) - custom_tags = created_warehouse.tags.as_dict() - assert 'RemoveAfter' in custom_tags.get("custom_tags")[0]["key"] - purge_time = datetime.strptime(custom_tags.get("custom_tags")[0]["value"], "%Y%m%d%H") - assert purge_time - datetime.utcnow() < timedelta(hours=1, minutes=15) - - -def test_remove_after_tag_instance_pool(ws, make_instance_pool): - new_instance_pool = make_instance_pool() - created_instance_pool = ws.instance_pools.get(new_instance_pool.instance_pool_id) - assert "RemoveAfter" in created_instance_pool.custom_tags - purge_time = datetime.strptime(created_instance_pool.custom_tags.get("RemoveAfter"), "%Y%m%d%H") - assert purge_time - datetime.utcnow() < timedelta(hours=1, minutes=15) - - -def test_remove_after_property_table(ws, make_table, sql_backend): - new_table = make_table() - # TODO: tables.get is currently failing with - # databricks.sdk.errors.platform.NotFound: Catalog 'hive_metastore' does not exist. - sql_response = list(sql_backend.fetch(f"DESCRIBE TABLE EXTENDED {new_table.full_name}")) - for row in sql_response: - if row.col_name == "Table Properties": - assert "RemoveAfter" in row[1] - - -def test_remove_after_property_schema(ws, make_schema, sql_backend): - new_schema = make_schema() - # TODO: schemas.get is currently failing with - # databricks.sdk.errors.platform.NotFound: Catalog 'hive_metastore' does not exist. 
- sql_response = list(sql_backend.fetch(f"DESCRIBE SCHEMA EXTENDED {new_schema.full_name}")) - for row in sql_response: - if row.database_description_item == "Properties": - assert "RemoveAfter" in row[1] diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index a87bba3890..e8375ce08c 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -8,7 +8,6 @@ from databricks.sdk.service.catalog import Privilege, SecurableType, TableInfo, TableType from databricks.sdk.service.iam import PermissionLevel -from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping from databricks.labs.ucx.hive_metastore.tables import AclMigrationWhat, Table, What @@ -564,61 +563,6 @@ def test_migrate_external_tables_with_principal_acl_aws( assert match -def test_migrate_table_in_mount( - ws, - sql_backend, - inventory_schema, - make_catalog, - make_schema, - env_or_skip, - make_random, - runtime_ctx, - make_acc_group, -): - if not ws.config.is_azure: - pytest.skip("temporary: only works in azure test env") - config = WorkspaceConfig( - warehouse_id=env_or_skip("TEST_DEFAULT_WAREHOUSE_ID"), - inventory_database=inventory_schema, - connect=ws.config, - ) - runtime_ctx = runtime_ctx.replace(config=config) - tbl_path = make_random(4).lower() - src_external_table = runtime_ctx.make_table( - schema_name=make_schema(catalog_name="hive_metastore", name=f'mounted_{env_or_skip("TEST_MOUNT_NAME")}').name, - external_delta=f'dbfs:/mnt/{env_or_skip("TEST_MOUNT_NAME")}/a/b/{tbl_path}', - ) - table_in_mount_location = f"abfss://things@labsazurethings.dfs.core.windows.net/a/b/{tbl_path}" - # TODO: Remove this hack below - # This is done because we have to create the external table in a mount point, but TablesInMounts() has to use the adls/ path - # Otherwise, if we keep the dbfs:/ path, the entire logic of TablesInMounts won't work - src_external_table.storage_location = table_in_mount_location - runtime_ctx.save_tables() - dst_catalog = make_catalog() - dst_schema = make_schema(catalog_name=dst_catalog.name) - - runtime_ctx.with_table_mapping_rules( - [ - Rule( - "workspace", - dst_catalog.name, - f'mounted_{env_or_skip("TEST_MOUNT_NAME")}', - dst_schema.name, - table_in_mount_location, - src_external_table.name, - ), - ] - ) - runtime_ctx.with_dummy_resource_permission() - - runtime_ctx.tables_migrator.migrate_tables(what=What.TABLE_IN_MOUNT) - - target_tables = list(sql_backend.fetch(f"SHOW TABLES IN {dst_schema.full_name}")) - assert len(target_tables) == 1 - target_table_properties = ws.tables.get(f"{dst_schema.full_name}.{src_external_table.name}") - assert target_table_properties.properties["upgraded_from"] == table_in_mount_location - - def test_migrate_external_tables_with_spn_azure( ws, make_user, prepared_principal_acl, make_cluster_permissions, make_cluster ): diff --git a/tests/integration/hive_metastore/test_workflows.py b/tests/integration/hive_metastore/test_workflows.py index 6eb9b8ffd4..1c2699b7e5 100644 --- a/tests/integration/hive_metastore/test_workflows.py +++ b/tests/integration/hive_metastore/test_workflows.py @@ -31,9 +31,7 @@ def test_table_migration_job_refreshes_migration_status(ws, installation_ctx, pr migration_status_query = f"SELECT * FROM {ctx.config.inventory_database}.migration_status" migration_statuses = list(ctx.sql_backend.fetch(migration_status_query)) - if len(migration_statuses) == 0: - 
ctx.deployed_workflows.relay_logs(workflow) - assert False, "No migration statuses found" + assert len(migration_statuses) > 0, "No migration statuses found" asserts = [] for table in tables.values(): diff --git a/tests/integration/recon/__init__.py b/tests/integration/recon/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/integration/recon/test_migrate_recon.py b/tests/integration/recon/test_migrate_recon.py deleted file mode 100644 index 8cab34b699..0000000000 --- a/tests/integration/recon/test_migrate_recon.py +++ /dev/null @@ -1,27 +0,0 @@ -from datetime import timedelta - -from databricks.sdk.errors import NotFound -from databricks.sdk.retries import retried - -from databricks.labs.ucx.hive_metastore.mapping import Rule -from databricks.labs.ucx.hive_metastore.tables import What - - -@retried(on=[NotFound], timeout=timedelta(minutes=2)) -def test_migrate_managed_tables_and_recon(ws, sql_backend, runtime_ctx, make_catalog): - src_schema = runtime_ctx.make_schema(catalog_name="hive_metastore") - src_managed_table = runtime_ctx.make_table(catalog_name=src_schema.catalog_name, schema_name=src_schema.name) - - dst_catalog = make_catalog() - dst_schema = runtime_ctx.make_schema(catalog_name=dst_catalog.name, name=src_schema.name) - - rules = [Rule.from_src_dst(src_managed_table, dst_schema)] - - runtime_ctx.with_table_mapping_rules(rules) - - runtime_ctx.with_dummy_resource_permission() - runtime_ctx.tables_migrator.migrate_tables(what=What.DBFS_ROOT_DELTA) - runtime_ctx.tables_migrator.index() - - recon_result = runtime_ctx.migration_recon.snapshot() - assert len(list(recon_result)) == 1, "Expected 1 recon result" diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index c19b192df2..dd9dfa5c65 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -1,19 +1,9 @@ -import io -import logging -from dataclasses import replace -from io import StringIO from pathlib import Path import pytest from databricks.sdk.service import compute -from databricks.sdk.service.workspace import ImportFormat -from databricks.labs.blueprint.tui import Prompts - -from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.mixins.wspath import WorkspacePath -from databricks.labs.ucx.source_code.files import LocalCodeLinter -from databricks.labs.ucx.source_code.languages import Languages from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.whitelist import Whitelist @@ -25,9 +15,7 @@ def test_running_real_workflow_linter_job(installation_ctx): ctx.deployed_workflows.validate_step("experimental-workflow-linter") cursor = ctx.sql_backend.fetch(f"SELECT COUNT(*) AS count FROM {ctx.inventory_database}.workflow_problems") result = next(cursor) - if result['count'] == 0: - ctx.deployed_workflows.relay_logs("experimental-workflow-linter") - assert False, "No workflow problems found" + assert result['count'] > 0 @pytest.fixture @@ -59,14 +47,7 @@ def test_job_linter_no_problems(simple_ctx, ws, make_job): assert len(problems) == 0 -def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, make_notebook, make_random, caplog): - expected_messages = { - "second_notebook:4 [direct-filesystem-access] The use of default dbfs: references is deprecated: /mnt/something", - "some_file.py:1 [direct-filesystem-access] The use of default dbfs: references is deprecated: /mnt/foo/bar", - 
"some_file.py:1 [dbfs-usage] Deprecated file system path in call to: /mnt/foo/bar", - "second_notebook:4 [dbfs-usage] Deprecated file system path in call to: /mnt/something", - } - +def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, make_notebook, make_random): entrypoint = WorkspacePath(ws, f"~/linter-{make_random(4)}").expanduser() entrypoint.mkdir() @@ -85,14 +66,15 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, some_file = entrypoint / 'some_file.py' some_file.write_text('display(spark.read.parquet("/mnt/foo/bar"))') - with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): - problems = simple_ctx.workflow_linter.lint_job(j.job_id) - - messages = {replace(p, path=Path(p.path).relative_to(entrypoint)).as_message() for p in problems} - assert messages == expected_messages + problems = simple_ctx.workflow_linter.lint_job(j.job_id) - last_messages = caplog.messages[-1].split("\n") - assert all(any(message.endswith(expected) for message in last_messages) for expected in expected_messages) + messages = {f'{Path(p.path).relative_to(entrypoint)}:{p.start_line} [{p.code}] {p.message}' for p in problems} + assert messages == { + 'second_notebook:4 [direct-filesystem-access] The use of default dbfs: references is deprecated: /mnt/something', + 'some_file.py:1 [direct-filesystem-access] The use of default dbfs: references is deprecated: /mnt/foo/bar', + 'some_file.py:1 [dbfs-usage] Deprecated file system path in call to: /mnt/foo/bar', + 'second_notebook:4 [dbfs-usage] Deprecated file system path in call to: /mnt/something', + } def test_workflow_linter_lints_job_with_import_pypi_library( @@ -124,49 +106,3 @@ def test_workflow_linter_lints_job_with_import_pypi_library( problems = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) assert len([problem for problem in problems if problem.message == "Could not locate import: pytest"]) == 0 - - -def test_lint_local_code(simple_ctx): - # no need to connect - light_ctx = simple_ctx.replace(languages=Languages(MigrationIndex([]))) - ucx_path = Path(__file__).parent.parent.parent.parent - path_to_scan = Path(ucx_path, "src") - linter = LocalCodeLinter( - light_ctx.file_loader, - light_ctx.folder_loader, - light_ctx.path_lookup, - light_ctx.dependency_resolver, - lambda: light_ctx.languages, - ) - problems = linter.lint(Prompts(), path_to_scan, StringIO()) - assert len(problems) > 0 - - -def test_workflow_linter_lints_job_with_requirements_dependency( - simple_ctx, - ws, - make_job, - make_notebook, - make_random, - make_directory, - tmp_path, -): - expected_problem_message = "Could not locate import: yaml" - - simple_ctx = simple_ctx.replace( - whitelist=Whitelist(use_defaults=False), # yaml is in default list - path_lookup=PathLookup(Path("/non/existing/path"), []), # Avoid finding the yaml locally - ) - - entrypoint = make_directory() - - requirements_file = f"{entrypoint}/requirements.txt" - ws.workspace.upload(requirements_file, io.BytesIO(b"pyyaml"), format=ImportFormat.AUTO) - library = compute.Library(requirements=requirements_file) - - notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=b"import yaml") - job_with_pytest_library = make_job(notebook_path=notebook, libraries=[library]) - - problems = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id) - - assert len([problem for problem in problems if problem.message == expected_problem_message]) == 0 diff --git a/tests/integration/test_installation.py 
b/tests/integration/test_installation.py index c14fdaa21d..a8abc3c358 100644 --- a/tests/integration/test_installation.py +++ b/tests/integration/test_installation.py @@ -115,16 +115,8 @@ def test_experimental_permissions_migration_for_group_with_same_name( ) new_schema_grants = installation_ctx.grants_crawler.for_schema_info(schema_a) - if {"USAGE", "OWN"} != new_schema_grants[migrated_group.name_in_account] or object_permissions[ - migrated_group.name_in_account - ] != PermissionLevel.CAN_USE: - installation_ctx.deployed_workflows.relay_logs("migrate-groups-experimental") - assert {"USAGE", "OWN"} == new_schema_grants[ - migrated_group.name_in_account - ], "Incorrect schema grants for migrated group" - assert ( - object_permissions[migrated_group.name_in_account] == PermissionLevel.CAN_USE - ), "Incorrect permissions for migrated group" + assert {"USAGE", "OWN"} == new_schema_grants[migrated_group.name_in_account] + assert object_permissions[migrated_group.name_in_account] == PermissionLevel.CAN_USE @retried(on=[NotFound, TimeoutError], timeout=timedelta(minutes=3)) diff --git a/tests/unit/aws/test_access.py b/tests/unit/aws/test_access.py index 291f260f94..468ea0f964 100644 --- a/tests/unit/aws/test_access.py +++ b/tests/unit/aws/test_access.py @@ -199,7 +199,6 @@ def test_create_uber_principal_existing_role_in_policy(mock_ws, mock_installatio ) mock_ws.cluster_policies.get.return_value = cluster_policy aws = create_autospec(AWSResources) - aws.validate_connection.return_value = {} aws.get_instance_profile_arn.return_value = instance_profile_arn locations = ExternalLocations(mock_ws, backend, "ucx") prompts = MockPrompts({"We have identified existing UCX migration role *": "yes"}) @@ -311,11 +310,10 @@ def test_create_uber_principal_no_storage(mock_ws, mock_installation, locations) def test_create_uc_role_single(mock_ws, installation_single_role, backend, locations): aws = create_autospec(AWSResources) - aws.validate_connection.return_value = {} aws_resource_permissions = AWSResourcePermissions(installation_single_role, mock_ws, aws, locations) role_creation = IamRoleCreation(installation_single_role, mock_ws, aws_resource_permissions) aws.list_all_uc_roles.return_value = [] - role_creation.run(MockPrompts({"Above *": "yes"}), single_role=True) + role_creation.run(MockPrompts({"Above *": "yes"})) assert aws.create_uc_role.assert_called assert ( call('UC_ROLE', 'UC_POLICY', {'s3://BUCKET1/FOLDER1', 's3://BUCKET2/FOLDER2'}, None, None) @@ -325,7 +323,6 @@ def test_create_uc_role_single(mock_ws, installation_single_role, backend, locat def test_create_uc_role_multiple(mock_ws, installation_single_role, backend, locations): aws = create_autospec(AWSResources) - aws.validate_connection.return_value = {} aws_resource_permissions = AWSResourcePermissions(installation_single_role, mock_ws, aws, locations) role_creation = IamRoleCreation(installation_single_role, mock_ws, aws_resource_permissions) aws.list_all_uc_roles.return_value = [] diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 803624626c..880f624ee5 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -94,6 +94,8 @@ def change_directory(self, new_working_directory: Path) -> 'MockPathLookup': def resolve(self, path: Path) -> Path | None: candidates = [path] + if not path.name.endswith('.txt'): + candidates.append(Path(f"{path}.txt")) for candidate in candidates: absolute_path = super().resolve(candidate) if not absolute_path: diff --git a/tests/unit/contexts/test_application.py 
b/tests/unit/contexts/test_application.py index 23396fefc4..d57455328c 100644 --- a/tests/unit/contexts/test_application.py +++ b/tests/unit/contexts/test_application.py @@ -1,34 +1,12 @@ -from unittest.mock import create_autospec - import pytest from databricks.labs.ucx.contexts.application import GlobalContext -from databricks.labs.ucx.contexts.workspace_cli import LocalCheckoutContext -from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.hive_metastore.table_migrate import TablesMigrator -from databricks.labs.ucx.source_code.languages import Languages -from tests.unit import mock_workspace_client -@pytest.mark.parametrize( - "attribute", ["dependency_resolver", "pip_resolver", "site_packages_path", "notebook_loader", "folder_loader"] -) +@pytest.mark.parametrize("attribute", ["dependency_resolver", "pip_resolver", "site_packages_path"]) def test_global_context_attributes_not_none(attribute: str): """Attributes should be not None""" # Goal is to improve test coverage ctx = GlobalContext() assert hasattr(ctx, attribute) assert getattr(ctx, attribute) is not None - - -@pytest.mark.parametrize("attribute", ["local_file_migrator", "local_code_linter"]) -def test_local_context_attributes_not_none(attribute: str): - """Attributes should be not None""" - # Goal is to improve test coverage - client = mock_workspace_client() - ctx = LocalCheckoutContext(client) - tables_migrator = create_autospec(TablesMigrator) - tables_migrator.index.return_value = None - ctx.replace(languages=Languages(MigrationIndex([])), tables_migrator=tables_migrator) - assert hasattr(ctx, attribute) - assert getattr(ctx, attribute) is not None diff --git a/tests/unit/hive_metastore/test_catalog_schema.py b/tests/unit/hive_metastore/test_catalog_schema.py index 4f7bd9f571..90d3af8a95 100644 --- a/tests/unit/hive_metastore/test_catalog_schema.py +++ b/tests/unit/hive_metastore/test_catalog_schema.py @@ -70,14 +70,6 @@ def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: 'src_table': 'abfss://container@msft/path/dest3', 'workspace_name': 'workspace', }, - { - 'catalog_name': 'catalog4', - 'dst_schema': 'schema4', - 'dst_table': 'table1', - 'src_schema': 'schema1', - 'src_table': 'abfss://container@msft/path/dest4', - 'workspace_name': 'workspace', - }, ] } ) @@ -94,9 +86,9 @@ def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: return CatalogSchema(ws, table_mapping, principal_acl, backend) -@pytest.mark.parametrize("location", ["s3://foo/bar", "s3://foo/bar/test", "s3://foo/bar/test/baz"]) +@pytest.mark.parametrize("location", ["s3://foo/bar", "s3://foo/bar/test"]) def test_create_all_catalogs_schemas_creates_catalogs(location: str): - """Catalog 2-4 should be created; catalog 1 already exists.""" + """Catalog 2 and 3 should be created; catalog 1 already exists.""" ws = create_autospec(WorkspaceClient) mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": location}) @@ -106,7 +98,6 @@ def test_create_all_catalogs_schemas_creates_catalogs(location: str): calls = [ call("catalog2", storage_root=location, comment="Created by UCX"), call("catalog3", storage_root=location, comment="Created by UCX"), - call("catalog4", storage_root=location, comment="Created by UCX"), ] ws.catalogs.create.assert_has_calls(calls, any_order=True) diff --git a/tests/unit/hive_metastore/test_locations.py b/tests/unit/hive_metastore/test_locations.py index 767951eda3..081fe2e49b 100644 --- 
a/tests/unit/hive_metastore/test_locations.py +++ b/tests/unit/hive_metastore/test_locations.py @@ -164,8 +164,6 @@ def test_save_external_location_mapping_missing_location(): ("s3://test_location/test1/table1", ""), ("gcs://test_location2/test2/table2", ""), ("abfss://cont1@storagetest1.dfs.core.windows.net/test2/table3", ""), - ("s3a://test_location_2/test1/table1", ""), - ("s3n://test_location_3/test1/table1", ""), ], } ) @@ -192,16 +190,6 @@ def test_save_external_location_mapping_missing_location(): ' name = "cont1_storagetest1_test2"\n' ' url = "abfss://cont1@storagetest1.dfs.core.windows.net/test2"\n' " credential_name = databricks_storage_credential..id\n" - "}\n\n" - 'resource "databricks_external_location" "test_location_2_test1" { \n' - ' name = "test_location_2_test1"\n' - ' url = "s3a://test_location_2/test1"\n' - " credential_name = databricks_storage_credential..id\n" - "}\n\n" - 'resource "databricks_external_location" "test_location_3_test1" { \n' - ' name = "test_location_3_test1"\n' - ' url = "s3n://test_location_3/test1"\n' - " credential_name = databricks_storage_credential..id\n" "}\n" ).encode("utf8"), ) diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index 38e27338b0..e454baf9f2 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -10,12 +10,8 @@ from databricks.sdk.errors.platform import ResourceConflict from databricks.sdk.service.catalog import TableInfo -from databricks.labs.ucx.hive_metastore.mapping import ( - Rule, - TableMapping, - TableToMigrate, -) from databricks.labs.ucx.account.workspaces import WorkspaceInfo +from databricks.labs.ucx.hive_metastore.mapping import Rule, TableMapping from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler MANAGED_DELTA_TABLE = Table( @@ -644,143 +640,6 @@ def test_is_target_exists(): assert table_mapping.exists_in_uc(src_table, "cat1.schema2.dest2") -def test_tables_in_mounts(): - client = create_autospec(WorkspaceClient) - client.workspace.download.return_value = io.BytesIO( - "workspace_name,catalog_name,src_schema,dst_schema,src_table,dst_table\r\n" - "fake_ws,cat1,schema1,schema1,table1,dest1\r\n".encode("utf8") - ) - errors = {} - rows = {} - backend = MockBackend(fails_on_first=errors, rows=rows) - installation = Installation(client, "ucx") - table_mapping = TableMapping(installation, client, backend) - tables_crawler = create_autospec(TablesCrawler) - tables_crawler.snapshot.return_value = [] - table_mapping.get_tables_to_migrate(tables_crawler) - - assert ["DESCRIBE SCHEMA EXTENDED schema1"] == backend.queries - - -def test_mapping_table_in_mount(): - client = create_autospec(WorkspaceClient) - client.workspace.download.return_value = io.BytesIO( - "workspace_name,catalog_name,src_schema,dst_schema,src_table,dst_table\r\n" - "prod_ws,cat1,mounted_dlk,schema1,abfss://bucket@msft/path/dest1,dest1\r\n".encode("utf8") - ) - errors = {} - rows = {} - backend = MockBackend(fails_on_first=errors, rows=rows) - client.tables.get.side_effect = NotFound() - client.catalogs.list.return_value = [] - client.schemas.list.return_value = [] - client.tables.list.return_value = [] - - installation = Installation(client, "ucx") - table_mapping = TableMapping(installation, client, backend) - tables_crawler = create_autospec(TablesCrawler) - src_table = Table( - object_type="EXTERNAL", - table_format="DELTA", - catalog="hive_metastore", - database="mounted_dlk", - name="table1", - 
location="abfss://bucket@msft/path/dest1", - ) - tables_crawler.snapshot.return_value = [ - src_table, - ] - tables = table_mapping.get_tables_to_migrate(tables_crawler) - assert "SHOW TBLPROPERTIES delta.`abfss://bucket@msft/path/dest1`" in backend.queries - assert tables == [ - TableToMigrate( - src=src_table, - rule=Rule( - workspace_name='prod_ws', - catalog_name='cat1', - src_schema='mounted_dlk', - dst_schema='schema1', - src_table='abfss://bucket@msft/path/dest1', - dst_table='dest1', - ), - ) - ] - - -def test_mapping_table_in_mount_exists_in_uc_with_properties(): - client = create_autospec(WorkspaceClient) - client.workspace.download.return_value = io.BytesIO( - "workspace_name,catalog_name,src_schema,dst_schema,src_table,dst_table\r\n" - "prod_ws,cat1,mounted_dlk,schema1,abfss://bucket@msft/path/dest1,dest1\r\n".encode("utf8") - ) - errors = {} - rows = {} - backend = MockBackend(fails_on_first=errors, rows=rows) - client.tables.get.side_effect = [ - TableInfo( - catalog_name="cat1", - schema_name="schema1", - name="dest1", - full_name="cat1.schema1.test_table1", - storage_location="abfss://bucket@msft/path/dest1", - ), - ] - - installation = Installation(client, "ucx") - table_mapping = TableMapping(installation, client, backend) - tables_crawler = create_autospec(TablesCrawler) - src_table = Table( - object_type="EXTERNAL", - table_format="DELTA", - catalog="hive_metastore", - database="mounted_dlk", - name="table1", - location="abfss://bucket@msft/path/dest1", - ) - tables_crawler.snapshot.return_value = [src_table] - tables = table_mapping.get_tables_to_migrate(tables_crawler) - assert not tables - - -def test_mapping_table_in_mount_exists_in_uc_with_bad_properties(): - client = create_autospec(WorkspaceClient) - client.workspace.download.return_value = io.BytesIO( - "workspace_name,catalog_name,src_schema,dst_schema,src_table,dst_table\r\n" - "prod_ws,cat1,mounted_dlk,schema1,abfss://bucket@msft/path/dest1,dest1\r\n".encode("utf8") - ) - errors = {} - rows = {} - backend = MockBackend(fails_on_first=errors, rows=rows) - client.tables.get.side_effect = [ - TableInfo( - catalog_name="cat1", - schema_name="schema1", - name="dest1", - full_name="cat1.schema1.test_table1", - storage_location="abfss://bucket@msft/path/dest34", - ), - ] - - installation = Installation(client, "ucx") - table_mapping = TableMapping(installation, client, backend) - tables_crawler = create_autospec(TablesCrawler) - src_table = Table( - object_type="EXTERNAL", - table_format="DELTA", - catalog="hive_metastore", - database="mounted_dlk", - name="table1", - location="abfss://bucket@msft/path/dest1", - ) - tables_crawler.snapshot.return_value = [src_table] - with pytest.raises(ResourceConflict): - try: - table_mapping.get_tables_to_migrate(tables_crawler) - except ManyError as e: - assert len(e.errs) == 1 - raise e.errs[0] - - def test_mapping_broken_table(caplog): client = create_autospec(WorkspaceClient) tables_crawler = create_autospec(TablesCrawler) diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index 0a9a38c474..e046f24948 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -2,6 +2,7 @@ import logging from itertools import cycle from unittest.mock import create_autospec + import pytest from databricks.labs.lsql.backends import MockBackend, SqlBackend from databricks.sdk import WorkspaceClient @@ -35,7 +36,6 @@ from .. 
import GROUPS, mock_table_mapping, mock_workspace_client - logger = logging.getLogger(__name__) @@ -1167,99 +1167,6 @@ def test_migrate_views_should_be_properly_sequenced(ws): assert next((key for key in table_keys if key == "hive_metastore.db1_src.t1_src"), None) is None -def test_table_in_mount_mapping_with_table_owner(): - client = create_autospec(WorkspaceClient) - client.tables.get.side_effect = NotFound() - backend = MockBackend( - rows={ - 'hive_metastore.test.tables': [ - ("hms", "mounted_datalake", "name", "object_type", "table_format", "abfss://bucket@msft/path/test") - ], - 'DESCRIBE TABLE delta.`abfss://bucket@msft/path/test`;': [ - ("col1", "string", None), - ("col2", "decimal", None), - ], - } - ) - table_mapping = create_autospec(TableMapping) - table_mapping.get_tables_to_migrate.return_value = [ - TableToMigrate( - Table("hive_metastore", "mounted_datalake", "test", "EXTERNAL", "DELTA", "abfss://bucket@msft/path/test"), - Rule("prod", "tgt_catalog", "mounted_datalake", "tgt_db", "abfss://bucket@msft/path/test", "test"), - ) - ] - table_crawler = TablesCrawler(backend, "inventory_database") - udf_crawler = UdfsCrawler(backend, "inventory_database") - grant_crawler = GrantsCrawler(table_crawler, udf_crawler) - group_manager = GroupManager(backend, client, "inventory_database") - migration_status_refresher = MigrationStatusRefresher(client, backend, "inventory_database", table_crawler) - principal_grants = create_autospec(PrincipalACL) - table_migrate = TablesMigrator( - table_crawler, - grant_crawler, - client, - backend, - table_mapping, - group_manager, - migration_status_refresher, - principal_grants, - ) - table_migrate.migrate_tables(what=What.TABLE_IN_MOUNT) - assert ( - "CREATE TABLE IF NOT EXISTS tgt_catalog.tgt_db.test (col1 string, col2 decimal) LOCATION 'abfss://bucket@msft/path/test';" - in backend.queries - ) - principal_grants.get_interactive_cluster_grants.assert_not_called() - - -def test_table_in_mount_mapping_with_partition_information(): - client = create_autospec(WorkspaceClient) - client.tables.get.side_effect = NotFound() - backend = MockBackend( - rows={ - 'hive_metastore.test.tables': [ - ("hms", "mounted_datalake", "name", "object_type", "table_format", "abfss://bucket@msft/path/test") - ], - 'DESCRIBE TABLE delta.`abfss://bucket@msft/path/test`;': [ - ("col1", "string", None), - ("col2", "decimal", None), - ("# Partition Information", None, None), - ("# col_name", None, None), - ("col1", None, None), - ], - } - ) - table_mapping = create_autospec(TableMapping) - table_mapping.get_tables_to_migrate.return_value = [ - TableToMigrate( - Table("hive_metastore", "mounted_datalake", "test", "EXTERNAL", "DELTA", "abfss://bucket@msft/path/test"), - Rule("prod", "tgt_catalog", "mounted_datalake", "tgt_db", "abfss://bucket@msft/path/test", "test"), - ) - ] - table_crawler = TablesCrawler(backend, "inventory_database") - udf_crawler = UdfsCrawler(backend, "inventory_database") - grant_crawler = GrantsCrawler(table_crawler, udf_crawler) - group_manager = GroupManager(backend, client, "inventory_database") - migration_status_refresher = MigrationStatusRefresher(client, backend, "inventory_database", table_crawler) - principal_grants = create_autospec(PrincipalACL) - table_migrate = TablesMigrator( - table_crawler, - grant_crawler, - client, - backend, - table_mapping, - group_manager, - migration_status_refresher, - principal_grants, - ) - table_migrate.migrate_tables(what=What.TABLE_IN_MOUNT) - assert ( - "CREATE TABLE IF NOT EXISTS tgt_catalog.tgt_db.test 
(col1 string, col2 decimal) PARTITIONED BY (col1) LOCATION 'abfss://bucket@msft/path/test';" - in backend.queries - ) - principal_grants.get_interactive_cluster_grants.assert_not_called() - - def test_migrate_view_failed(ws, caplog): errors = {"CREATE OR REPLACE VIEW": "error"} create = "CREATE OR REPLACE VIEW hive_metastore.db1_src.view_src (a,b) AS SELECT * FROM db1_src.managed_dbfs" diff --git a/tests/unit/hive_metastore/test_table_size.py b/tests/unit/hive_metastore/test_table_size.py index 0684c21f3f..77599c2b85 100644 --- a/tests/unit/hive_metastore/test_table_size.py +++ b/tests/unit/hive_metastore/test_table_size.py @@ -34,8 +34,6 @@ def test_table_size_crawler(mocker): tsc = TableSizeCrawler(backend, "inventory_database") tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = [100, 200, 300] results = tsc.snapshot() - assert "ANALYZE table hive_metastore.db1.table1 compute STATISTICS NOSCAN" in backend.queries - assert "ANALYZE table hive_metastore.db1.table2 compute STATISTICS NOSCAN" in backend.queries assert len(results) == 2 assert TableSize("hive_metastore", "db1", "table1", 100) in results assert TableSize("hive_metastore", "db1", "table2", 200) in results diff --git a/tests/unit/hive_metastore/test_workflows.py b/tests/unit/hive_metastore/test_workflows.py index 2cab1fd86c..3e0083fe5f 100644 --- a/tests/unit/hive_metastore/test_workflows.py +++ b/tests/unit/hive_metastore/test_workflows.py @@ -5,7 +5,6 @@ MigrateExternalTablesCTAS, MigrateHiveSerdeTablesInPlace, MigrateTablesInMounts, - ScanTablesInMounts, ) @@ -55,14 +54,7 @@ def test_migrate_ctas_views(run_workflow): @pytest.mark.parametrize( - "workflow", - [ - TableMigration, - MigrateHiveSerdeTablesInPlace, - MigrateExternalTablesCTAS, - ScanTablesInMounts, - MigrateTablesInMounts, - ], + "workflow", [TableMigration, MigrateHiveSerdeTablesInPlace, MigrateExternalTablesCTAS, MigrateTablesInMounts] ) def test_refresh_migration_status_is_refreshed(run_workflow, workflow): """Migration status is refreshed by deleting and showing new tables""" diff --git a/tests/unit/installer/test_policy.py b/tests/unit/installer/test_policy.py index 980ec4f34d..768bad4112 100644 --- a/tests/unit/installer/test_policy.py +++ b/tests/unit/installer/test_policy.py @@ -108,7 +108,6 @@ def test_cluster_policy_definition_aws_glue(): "spark_conf.spark.databricks.hive.metastore.glueCatalog.enabled": {"type": "fixed", "value": "true"}, "aws_attributes.instance_profile_arn": {"type": "fixed", "value": "role_arn_1"}, "aws_attributes.availability": {"type": "fixed", "value": "ON_DEMAND"}, - "aws_attributes.zone_id": {"type": "fixed", "value": "auto"}, } assert policy_id == "foo1" assert instance_profile == "role_arn_1" @@ -287,7 +286,6 @@ def test_cluster_policy_definition_aws_glue_warehouse(): "spark_conf.spark.databricks.hive.metastore.glueCatalog.enabled": {"type": "fixed", "value": "true"}, "aws_attributes.instance_profile_arn": {"type": "fixed", "value": "role_arn_1"}, "aws_attributes.availability": {"type": "fixed", "value": "ON_DEMAND"}, - "aws_attributes.zone_id": {"type": "fixed", "value": "auto"}, } assert policy_id == "foo1" assert instance_profile == "role_arn_1" @@ -382,7 +380,6 @@ def test_cluster_policy_definition_empty_config(): "spark_version": {"type": "fixed", "value": "14.2.x-scala2.12"}, "node_type_id": {"type": "fixed", "value": "Standard_F4s"}, "aws_attributes.availability": {"type": "fixed", "value": "ON_DEMAND"}, - "aws_attributes.zone_id": {"type": "fixed", "value": "auto"}, } assert 
policy_id == "foo1" @@ -425,7 +422,6 @@ def test_cluster_policy_instance_pool(): "spark_version": {"type": "fixed", "value": "14.2.x-scala2.12"}, "node_type_id": {"type": "fixed", "value": "Standard_F4s"}, "aws_attributes.availability": {"type": "fixed", "value": "ON_DEMAND"}, - "aws_attributes.zone_id": {"type": "fixed", "value": "auto"}, } _, _, _, instance_pool_id = policy_installer.create('ucx') assert instance_pool_id is None diff --git a/tests/unit/recon/conftest.py b/tests/unit/recon/conftest.py index 0d0f469db8..1cb5fb8314 100644 --- a/tests/unit/recon/conftest.py +++ b/tests/unit/recon/conftest.py @@ -21,6 +21,6 @@ def row_count_row_factory(): def data_comp_row_factory(): yield MockBackend.rows( "total_mismatches", - "target_missing_count", - "source_missing_count", + "num_missing_records_in_target", + "num_missing_records_in_source", ) diff --git a/tests/unit/recon/test_data_comparator.py b/tests/unit/recon/test_data_comparator.py index e47588b9fa..d27484cd9b 100644 --- a/tests/unit/recon/test_data_comparator.py +++ b/tests/unit/recon/test_data_comparator.py @@ -36,13 +36,13 @@ def test_data_comparison(metadata_row_factory, row_count_row_factory, data_comp_ expected_comparison_result = DataComparisonResult( source_row_count=100, target_row_count=2, - source_missing_count=2, - target_missing_count=100, + num_missing_records_in_target=100, + num_missing_records_in_source=2, ) data_profiler = StandardDataProfiler(sql_backend, DatabricksTableMetadataRetriever(sql_backend)) data_comparator = StandardDataComparator(sql_backend, data_profiler) - actual_comparison_result = data_comparator.compare_data(source, target, True) + actual_comparison_result = data_comparator.compare_data(source, target) assert actual_comparison_result == expected_comparison_result diff --git a/tests/unit/recon/test_migration_recon.py b/tests/unit/recon/test_migration_recon.py deleted file mode 100644 index 9d0edcd9e2..0000000000 --- a/tests/unit/recon/test_migration_recon.py +++ /dev/null @@ -1,91 +0,0 @@ -from unittest.mock import create_autospec - -import pytest -from databricks.labs.lsql.backends import MockBackend -from databricks.sdk import WorkspaceClient - -from databricks.labs.ucx.hive_metastore import TablesCrawler -from databricks.labs.ucx.hive_metastore.migration_status import MigrationStatusRefresher -from databricks.labs.ucx.recon.base import TableIdentifier -from databricks.labs.ucx.recon.data_comparator import StandardDataComparator -from databricks.labs.ucx.recon.data_profiler import StandardDataProfiler -from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever -from databricks.labs.ucx.recon.migration_recon import MigrationRecon -from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator -from tests.unit import mock_table_mapping - - -@pytest.fixture -def ws(): - client = create_autospec(WorkspaceClient) - client.get_workspace_id.return_value = "12345" - return client - - -MIGRATION_STATUS = MockBackend.rows( - "src_schema", - "src_table", - "dst_catalog", - "dst_schema", - "dst_table", - "update_ts", -) -UPGRADED_TO = MockBackend.rows("key", "value") - - -def test_migrate_recon_should_produce_proper_queries( - ws, - metadata_row_factory, - row_count_row_factory, - data_comp_row_factory, -): - src = TableIdentifier("hive_metastore", "db1", "table1") - src_2 = TableIdentifier("hive_metastore", "db2", "table2") - src_3 = TableIdentifier("hive_metastore", "db3", "table3") - tgt = TableIdentifier("catalog1", "schema1", "table1") - tgt_2 = 
TableIdentifier("catalog2", "schema2", "table2") - tgt_3 = TableIdentifier("catalog3", "schema3", "table3") - errors = {} - rows = { - 'SELECT \\* FROM inventory_database.migration_status': MIGRATION_STATUS[ - (src.schema, src.table, tgt.catalog, tgt.schema, tgt.table, "2021-01-01T00:00:00Z"), - (src_2.schema, src_2.table, tgt_2.catalog, tgt_2.schema, tgt_2.table, "2021-01-01T00:00:00Z"), - (src_3.schema, src_3.table, tgt_3.catalog, tgt_3.schema, tgt_3.table, "2021-01-01T00:00:00Z"), - ("schema_none", "table_none", None, "schema_a", "table_a", "2021-01-01T00:00:00Z"), - ], - f"SHOW TBLPROPERTIES {src.schema}.{src.table} \\('upgraded_to'\\)": UPGRADED_TO[("value", "fake_dest"),], - f"SHOW TBLPROPERTIES {src_2.schema}.{src_2.table} \\('upgraded_to'\\)": UPGRADED_TO[("value", "fake_dest"),], - "DESCRIBE TABLE": metadata_row_factory[ - ("col1", "int"), - ("col2", "string"), - ], - f"{tgt.catalog_escaped}\\.information_schema\\.columns": metadata_row_factory[ - ("col1", "int"), - ("col2", "string"), - ], - f"{tgt_2.catalog_escaped}\\.information_schema\\.columns": metadata_row_factory[ - ("col1", "int"), - ("col2", "string"), - ], - f"SELECT COUNT\\(\\*\\) as row_count FROM {src.fqn_escaped}": row_count_row_factory[100,], - f"SELECT COUNT\\(\\*\\) as row_count FROM {tgt.fqn_escaped}": row_count_row_factory[2,], - f"SELECT COUNT\\(\\*\\) as row_count FROM {src_2.fqn_escaped}": row_count_row_factory[0,], - f"SELECT COUNT\\(\\*\\) as row_count FROM {tgt_2.fqn_escaped}": row_count_row_factory[0,], - "WITH compare_results": data_comp_row_factory[(102, 100, 2),], - } - backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "inventory_database") - migration_status_refresher = MigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) - metadata_retriever = DatabricksTableMetadataRetriever(backend) - data_profiler = StandardDataProfiler(backend, metadata_retriever) - migration_recon = MigrationRecon( - backend, - "inventory_database", - migration_status_refresher, - mock_table_mapping(["managed_dbfs", "managed_mnt", "managed_other"]), - StandardSchemaComparator(metadata_retriever), - StandardDataComparator(backend, data_profiler), - 0, - ) - results = list(migration_recon.snapshot()) - assert len(results) == 2 diff --git a/tests/unit/recon/test_workflow.py b/tests/unit/recon/test_workflow.py deleted file mode 100644 index 06ca926a3b..0000000000 --- a/tests/unit/recon/test_workflow.py +++ /dev/null @@ -1,6 +0,0 @@ -from databricks.labs.ucx.recon.workflows import MigrationRecon - - -def test_migration_recon_refresh(run_workflow): - ctx = run_workflow(MigrationRecon.recon_migration_result) - ctx.workspace_client.catalogs.list.assert_called_once() diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index 5e6fe5c438..ef8e581ad3 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -91,15 +91,3 @@ def test_pip_cell_build_dependency_graph_resolves_installed_library(mock_path_lo assert len(problems) == 0 assert graph.path_lookup.resolve(Path("pkgdir")).exists() - - -def test_pip_cell_build_dependency_graph_handles_multiline_code(): - graph = create_autospec(DependencyGraph) - - code = "%pip install databricks\nmore code defined" - cell = PipCell(code) - - problems = cell.build_dependency_graph(graph) - - assert len(problems) == 0 - graph.register_library.assert_called_once_with("databricks") diff --git 
a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py deleted file mode 100644 index 5251f3b1dc..0000000000 --- a/tests/unit/source_code/notebooks/test_sources.py +++ /dev/null @@ -1,45 +0,0 @@ -from pathlib import Path - -import pytest - -from databricks.labs.ucx.source_code.languages import Languages -from databricks.labs.ucx.source_code.notebooks.sources import FileLinter - - -@pytest.mark.parametrize("path, content", [("xyz.py", "a = 3"), ("xyz.sql", "select * from dual")]) -def test_file_linter_lints_supported_language(path, content, migration_index): - linter = FileLinter(Languages(migration_index), Path(path), content) - advices = list(linter.lint()) - assert not advices - - -@pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"]) -def test_file_linter_lints_not_yet_supported_language(path, migration_index): - linter = FileLinter(Languages(migration_index), Path(path), "") - advices = list(linter.lint()) - assert [advice.code for advice in advices] == ["unsupported-language"] - - -@pytest.mark.parametrize( - "path", - [ - "xyz.json", - "xyz.xml", - "xyz.yml", - "xyz.cfg", - "xyz.md", - "xyz.txt", - "xyz.gif", - "xyz.png", - "xyz.jpg", - "xyz.jpeg", - "xyz.tif", - "xyz.bmp", - "xyz.toml", - ".DS_Store", # on MacOS - ], -) -def test_file_linter_lints_ignorable_language(path, migration_index): - linter = FileLinter(Languages(migration_index), Path(path), "") - advices = list(linter.lint()) - assert not advices diff --git a/tests/unit/source_code/samples/00_var_context.py b/tests/unit/source_code/samples/00_var_context.py.txt similarity index 100% rename from tests/unit/source_code/samples/00_var_context.py rename to tests/unit/source_code/samples/00_var_context.py.txt diff --git a/tests/unit/source_code/samples/01_HL7Streaming.scala b/tests/unit/source_code/samples/01_HL7Streaming.scala.txt similarity index 100% rename from tests/unit/source_code/samples/01_HL7Streaming.scala rename to tests/unit/source_code/samples/01_HL7Streaming.scala.txt diff --git a/tests/unit/source_code/samples/01_var_market_etl.py b/tests/unit/source_code/samples/01_var_market_etl.py.txt similarity index 100% rename from tests/unit/source_code/samples/01_var_market_etl.py rename to tests/unit/source_code/samples/01_var_market_etl.py.txt diff --git a/tests/unit/source_code/samples/3_SparkR_Fine Grained Demand Forecasting.r b/tests/unit/source_code/samples/3_SparkR_Fine Grained Demand Forecasting.r.txt similarity index 100% rename from tests/unit/source_code/samples/3_SparkR_Fine Grained Demand Forecasting.r rename to tests/unit/source_code/samples/3_SparkR_Fine Grained Demand Forecasting.r.txt diff --git a/tests/unit/source_code/samples/builtins.py b/tests/unit/source_code/samples/builtins.py.txt similarity index 100% rename from tests/unit/source_code/samples/builtins.py rename to tests/unit/source_code/samples/builtins.py.txt diff --git a/tests/unit/source_code/samples/chf-pqi-scoring.sql b/tests/unit/source_code/samples/chf-pqi-scoring.sql.txt similarity index 100% rename from tests/unit/source_code/samples/chf-pqi-scoring.sql rename to tests/unit/source_code/samples/chf-pqi-scoring.sql.txt diff --git a/tests/unit/source_code/samples/cyclical2.run.py b/tests/unit/source_code/samples/cyclical1.run.py.txt similarity index 83% rename from tests/unit/source_code/samples/cyclical2.run.py rename to tests/unit/source_code/samples/cyclical1.run.py.txt index 58478444ef..3029131cc7 100644 --- a/tests/unit/source_code/samples/cyclical2.run.py +++ 
b/tests/unit/source_code/samples/cyclical1.run.py.txt @@ -13,4 +13,4 @@ # COMMAND ---------- -# MAGIC %run "./cyclical1.run.py" +# MAGIC %run "./cyclical2.run.py.txt" diff --git a/tests/unit/source_code/samples/root2.run.py b/tests/unit/source_code/samples/cyclical2.run.py.txt similarity index 83% rename from tests/unit/source_code/samples/root2.run.py rename to tests/unit/source_code/samples/cyclical2.run.py.txt index 7fbb4c90a7..50961f8ed9 100644 --- a/tests/unit/source_code/samples/root2.run.py +++ b/tests/unit/source_code/samples/cyclical2.run.py.txt @@ -13,4 +13,4 @@ # COMMAND ---------- -# MAGIC %run "./root1.run.py" +# MAGIC %run "./cyclical1.run.py.txt" diff --git a/tests/unit/source_code/samples/import-site-package.py b/tests/unit/source_code/samples/import-site-package.py.txt similarity index 100% rename from tests/unit/source_code/samples/import-site-package.py rename to tests/unit/source_code/samples/import-site-package.py.txt diff --git a/tests/unit/source_code/samples/import-sub-site-package.py b/tests/unit/source_code/samples/import-sub-site-package.py.txt similarity index 100% rename from tests/unit/source_code/samples/import-sub-site-package.py rename to tests/unit/source_code/samples/import-sub-site-package.py.txt diff --git a/tests/unit/source_code/samples/known.py b/tests/unit/source_code/samples/known.py.txt similarity index 100% rename from tests/unit/source_code/samples/known.py rename to tests/unit/source_code/samples/known.py.txt diff --git a/tests/unit/source_code/samples/leaf1.py b/tests/unit/source_code/samples/leaf1.py.txt similarity index 100% rename from tests/unit/source_code/samples/leaf1.py rename to tests/unit/source_code/samples/leaf1.py.txt diff --git a/tests/unit/source_code/samples/leaf2.py b/tests/unit/source_code/samples/leaf2.py.txt similarity index 100% rename from tests/unit/source_code/samples/leaf2.py rename to tests/unit/source_code/samples/leaf2.py.txt diff --git a/tests/unit/source_code/samples/leaf3.py b/tests/unit/source_code/samples/leaf3.py.txt similarity index 100% rename from tests/unit/source_code/samples/leaf3.py rename to tests/unit/source_code/samples/leaf3.py.txt diff --git a/tests/unit/source_code/samples/leaf4.py b/tests/unit/source_code/samples/leaf4.py.txt similarity index 100% rename from tests/unit/source_code/samples/leaf4.py rename to tests/unit/source_code/samples/leaf4.py.txt diff --git a/tests/unit/source_code/samples/leaf6.py b/tests/unit/source_code/samples/leaf6.py.txt similarity index 100% rename from tests/unit/source_code/samples/leaf6.py rename to tests/unit/source_code/samples/leaf6.py.txt diff --git a/tests/unit/source_code/samples/leaf9.py b/tests/unit/source_code/samples/leaf9.py.txt similarity index 100% rename from tests/unit/source_code/samples/leaf9.py rename to tests/unit/source_code/samples/leaf9.py.txt diff --git a/tests/unit/source_code/samples/notebook-with-pip-cell.py b/tests/unit/source_code/samples/notebook-with-pip-cell.py.txt similarity index 100% rename from tests/unit/source_code/samples/notebook-with-pip-cell.py rename to tests/unit/source_code/samples/notebook-with-pip-cell.py.txt diff --git a/tests/unit/source_code/samples/notebook-with-shell-cell.py b/tests/unit/source_code/samples/notebook-with-shell-cell.py.txt similarity index 100% rename from tests/unit/source_code/samples/notebook-with-shell-cell.py rename to tests/unit/source_code/samples/notebook-with-shell-cell.py.txt diff --git a/tests/unit/source_code/samples/python_builtins.py 
b/tests/unit/source_code/samples/python_builtins.py.txt similarity index 100% rename from tests/unit/source_code/samples/python_builtins.py rename to tests/unit/source_code/samples/python_builtins.py.txt diff --git a/tests/unit/source_code/samples/root1-no-leaf.run.py b/tests/unit/source_code/samples/root1-no-leaf.run.py.txt similarity index 89% rename from tests/unit/source_code/samples/root1-no-leaf.run.py rename to tests/unit/source_code/samples/root1-no-leaf.run.py.txt index c906fd6e94..022b6f5bb8 100644 --- a/tests/unit/source_code/samples/root1-no-leaf.run.py +++ b/tests/unit/source_code/samples/root1-no-leaf.run.py.txt @@ -13,7 +13,7 @@ # COMMAND ---------- -# MAGIC %run "./leaf1.py" +# MAGIC %run "./leaf1.py.txt" # COMMAND ---------- diff --git a/tests/unit/source_code/samples/root1.run.py b/tests/unit/source_code/samples/root1.run.py.txt similarity index 78% rename from tests/unit/source_code/samples/root1.run.py rename to tests/unit/source_code/samples/root1.run.py.txt index 20fb0d2b2d..25ee6cb400 100644 --- a/tests/unit/source_code/samples/root1.run.py +++ b/tests/unit/source_code/samples/root1.run.py.txt @@ -13,9 +13,9 @@ # COMMAND ---------- -# MAGIC %run "./leaf1.py" +# MAGIC %run "./leaf1.py.txt" # COMMAND ---------- -# MAGIC %run "./leaf2.py" +# MAGIC %run "./leaf2.py.txt" diff --git a/tests/unit/source_code/samples/root10.py b/tests/unit/source_code/samples/root10.py.txt similarity index 66% rename from tests/unit/source_code/samples/root10.py rename to tests/unit/source_code/samples/root10.py.txt index 421b9a878f..69f607de70 100644 --- a/tests/unit/source_code/samples/root10.py +++ b/tests/unit/source_code/samples/root10.py.txt @@ -1,3 +1,3 @@ # Databricks notebook source -some_notebook = "./leaf3.py" +some_notebook = "./leaf3.py.txt" dbutils.notebook.run(some_notebook) diff --git a/tests/unit/source_code/samples/cyclical1.run.py b/tests/unit/source_code/samples/root2.run.py.txt similarity index 84% rename from tests/unit/source_code/samples/cyclical1.run.py rename to tests/unit/source_code/samples/root2.run.py.txt index d9b4fbb5fb..639225eae7 100644 --- a/tests/unit/source_code/samples/cyclical1.run.py +++ b/tests/unit/source_code/samples/root2.run.py.txt @@ -13,4 +13,4 @@ # COMMAND ---------- -# MAGIC %run "./cyclical2.run.py" +# MAGIC %run "./root1.run.py.txt" diff --git a/tests/unit/source_code/samples/root3.run.py b/tests/unit/source_code/samples/root3.run.py.txt similarity index 75% rename from tests/unit/source_code/samples/root3.run.py rename to tests/unit/source_code/samples/root3.run.py.txt index ab30c588f9..022ad7a465 100644 --- a/tests/unit/source_code/samples/root3.run.py +++ b/tests/unit/source_code/samples/root3.run.py.txt @@ -13,8 +13,8 @@ # COMMAND ---------- -# MAGIC %run "./root1.run.py" +# MAGIC %run "./root1.run.py.txt" # COMMAND ---------- -# MAGIC %run "./root1.run.py" +# MAGIC %run "./root1.run.py.txt" diff --git a/tests/unit/source_code/samples/root4-no-leaf.py b/tests/unit/source_code/samples/root4-no-leaf.py.txt similarity index 100% rename from tests/unit/source_code/samples/root4-no-leaf.py rename to tests/unit/source_code/samples/root4-no-leaf.py.txt diff --git a/tests/unit/source_code/samples/root4.py b/tests/unit/source_code/samples/root4.py deleted file mode 100644 index 3288683699..0000000000 --- a/tests/unit/source_code/samples/root4.py +++ /dev/null @@ -1,2 +0,0 @@ -# Databricks notebook source -dbutils.notebook.run("./leaf3.py") diff --git a/tests/unit/source_code/samples/root4.py.txt b/tests/unit/source_code/samples/root4.py.txt new 
file mode 100644 index 0000000000..bb004e10b9 --- /dev/null +++ b/tests/unit/source_code/samples/root4.py.txt @@ -0,0 +1,2 @@ +# Databricks notebook source +dbutils.notebook.run("./leaf3.py.txt") diff --git a/tests/unit/source_code/samples/root5.py b/tests/unit/source_code/samples/root5.py.txt similarity index 100% rename from tests/unit/source_code/samples/root5.py rename to tests/unit/source_code/samples/root5.py.txt diff --git a/tests/unit/source_code/samples/root6.py b/tests/unit/source_code/samples/root6.py.txt similarity index 100% rename from tests/unit/source_code/samples/root6.py rename to tests/unit/source_code/samples/root6.py.txt diff --git a/tests/unit/source_code/samples/root7.py b/tests/unit/source_code/samples/root7.py.txt similarity index 100% rename from tests/unit/source_code/samples/root7.py rename to tests/unit/source_code/samples/root7.py.txt diff --git a/tests/unit/source_code/samples/root8.py b/tests/unit/source_code/samples/root8.py.txt similarity index 100% rename from tests/unit/source_code/samples/root8.py rename to tests/unit/source_code/samples/root8.py.txt diff --git a/tests/unit/source_code/samples/root9.py b/tests/unit/source_code/samples/root9.py.txt similarity index 100% rename from tests/unit/source_code/samples/root9.py rename to tests/unit/source_code/samples/root9.py.txt diff --git a/tests/unit/source_code/samples/run_notebooks.py b/tests/unit/source_code/samples/run_notebooks.py.txt similarity index 100% rename from tests/unit/source_code/samples/run_notebooks.py rename to tests/unit/source_code/samples/run_notebooks.py.txt diff --git a/tests/unit/source_code/test_dependencies.py b/tests/unit/source_code/test_dependencies.py index 02dd0ad1a9..0731e97649 100644 --- a/tests/unit/source_code/test_dependencies.py +++ b/tests/unit/source_code/test_dependencies.py @@ -10,10 +10,10 @@ NotebookResolver, NotebookLoader, ) -from databricks.labs.ucx.source_code.files import FileLoader, ImportFileResolver +from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.python_libraries import PipResolver -from databricks.labs.ucx.source_code.whitelist import Whitelist +from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist from tests.unit import ( locate_site_packages, _samples_path, @@ -33,36 +33,43 @@ def test_dependency_resolver_visits_workspace_notebook_dependencies(mock_path_lo dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(Path("root3.run.py")) assert not maybe.failed - assert maybe.graph.all_relative_names() == {"root3.run.py", "root1.run.py", "leaf1.py", "leaf2.py"} + assert maybe.graph.all_relative_names() == {"root3.run.py.txt", "root1.run.py.txt", "leaf1.py.txt", "leaf2.py.txt"} def test_dependency_resolver_visits_local_notebook_dependencies(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) - maybe = dependency_resolver.build_notebook_dependency_graph(Path("root4.py")) + maybe = dependency_resolver.build_notebook_dependency_graph(Path("root4.py.txt")) assert not maybe.failed - assert maybe.graph.all_relative_names() == {"root4.py", "leaf3.py"} + assert maybe.graph.all_relative_names() == {"root4.py.txt", "leaf3.py.txt"} def 
test_dependency_resolver_visits_workspace_file_dependencies(mock_path_lookup): - whitelist = Whitelist() + whi = Whitelist() file_loader = FileLoader() notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(file_loader, whitelist) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, mock_path_lookup) + import_resolvers = [ + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(Path('./root8.py')) assert not maybe.failed - assert maybe.graph.all_relative_names() == {'leaf1.py', 'leaf2.py', 'root8.py'} + assert maybe.graph.all_relative_names() == {'leaf1.py.txt', 'leaf2.py.txt', 'root8.py.txt'} def test_dependency_resolver_raises_problem_with_unfound_workspace_notebook_dependency(mock_path_lookup): + whi = Whitelist() file_loader = FileLoader() notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(file_loader, Whitelist()) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, mock_path_lookup) + import_resolvers = [ + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(Path("root1-no-leaf.run.py")) assert list(maybe.problems) == [ DependencyProblem( @@ -93,12 +100,12 @@ def test_dependency_resolver_raises_problem_with_non_constant_local_notebook_dep notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) - maybe = dependency_resolver.build_notebook_dependency_graph(Path('root10.py')) + maybe = dependency_resolver.build_notebook_dependency_graph(Path('root10.py.txt')) assert list(maybe.problems) == [ DependencyProblem( 'dependency-not-constant', "Can't check dependency not provided as a constant", - Path('root10.py'), + Path('root10.py.txt'), 2, 0, 2, @@ -111,44 +118,46 @@ def test_dependency_resolver_raises_problem_with_invalid_run_cell(mock_path_look notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) - maybe = dependency_resolver.build_notebook_dependency_graph(Path('leaf6.py')) + maybe = dependency_resolver.build_notebook_dependency_graph(Path('leaf6.py.txt')) assert list(maybe.problems) == [ - DependencyProblem('invalid-run-cell', 'Missing notebook path in %run command', Path('leaf6.py'), 5, 0, 5, 4) + DependencyProblem('invalid-run-cell', 'Missing notebook path in %run command', Path('leaf6.py.txt'), 5, 0, 5, 4) ] def test_dependency_resolver_visits_recursive_file_dependencies(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, mock_path_lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path("root6.py")) + import_resolvers = [LocalFileResolver(FileLoader())] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, mock_path_lookup) + maybe = 
dependency_resolver.build_local_file_dependency_graph(Path("root6.py.txt")) assert not maybe.failed - assert maybe.graph.all_relative_names() == {"root6.py", "root5.py", "leaf4.py"} + assert maybe.graph.all_relative_names() == {"root6.py.txt", "root5.py.txt", "leaf4.py.txt"} def test_dependency_resolver_raises_problem_with_unresolved_import(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, mock_path_lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path('root7.py')) + import_resolvers = [LocalFileResolver(FileLoader())] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, mock_path_lookup) + maybe = dependency_resolver.build_local_file_dependency_graph(Path('root7.py.txt')) assert list(maybe.problems) == [ - DependencyProblem('import-not-found', 'Could not locate import: some_library', Path("root7.py"), 1, 0, 1, 19) + DependencyProblem( + 'import-not-found', 'Could not locate import: some_library', Path("root7.py.txt"), 1, 0, 1, 19 + ) ] def test_dependency_resolver_raises_problem_with_non_constant_notebook_argument(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, mock_path_lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path("run_notebooks.py")) + import_resolvers = [WhitelistResolver(Whitelist()), LocalFileResolver(FileLoader())] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, mock_path_lookup) + maybe = dependency_resolver.build_local_file_dependency_graph(Path("run_notebooks.py.txt")) assert list(maybe.problems) == [ DependencyProblem( 'dependency-not-constant', "Can't check dependency not provided as a constant", - Path("run_notebooks.py"), + Path("run_notebooks.py.txt"), 14, 13, 14, @@ -160,19 +169,19 @@ def test_dependency_resolver_raises_problem_with_non_constant_notebook_argument( def test_dependency_resolver_visits_file_dependencies(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, mock_path_lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path("root5.py")) + import_resolvers = [LocalFileResolver(FileLoader())] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, mock_path_lookup) + maybe = dependency_resolver.build_local_file_dependency_graph(Path("root5.py.txt")) assert not maybe.failed - assert maybe.graph.all_relative_names() == {"root5.py", "leaf4.py"} + assert maybe.graph.all_relative_names() == {"root5.py.txt", "leaf4.py.txt"} def test_dependency_resolver_skips_builtin_dependencies(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, mock_path_lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path("python_builtins.py")) + import_resolvers = [WhitelistResolver(Whitelist()), 
LocalFileResolver(FileLoader())] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, mock_path_lookup) + maybe = dependency_resolver.build_local_file_dependency_graph(Path("python_builtins.py.txt")) assert not maybe.failed graph = maybe.graph maybe = graph.locate_dependency(Path("os")) @@ -184,9 +193,9 @@ def test_dependency_resolver_skips_builtin_dependencies(mock_path_lookup): def test_dependency_resolver_ignores_known_dependencies(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, mock_path_lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path("python_builtins.py")) + import_resolvers = [WhitelistResolver(Whitelist()), LocalFileResolver(FileLoader())] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, mock_path_lookup) + maybe = dependency_resolver.build_local_file_dependency_graph(Path("python_builtins.py.txt")) assert maybe.graph graph = maybe.graph maybe_graph = graph.locate_dependency(Path("databricks")) @@ -198,9 +207,12 @@ def test_dependency_resolver_visits_site_packages(empty_index, mock_notebook_res lookup = PathLookup.from_pathlike_string(Path.cwd(), _samples_path(SourceContainer)) lookup.append_path(site_packages_path) file_loader = FileLoader() - import_resolver = ImportFileResolver(file_loader, Whitelist()) - dependency_resolver = DependencyResolver([], mock_notebook_resolver, import_resolver, lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-site-package.py")) + import_resolvers = [ + WhitelistResolver(Whitelist()), + LocalFileResolver(file_loader), + ] + dependency_resolver = DependencyResolver([], mock_notebook_resolver, import_resolvers, lookup) + maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-site-package.py.txt")) assert not maybe.failed graph = maybe.graph maybe = graph.locate_dependency(Path(site_packages_path, "certifi", "core.py")) @@ -219,20 +231,23 @@ def test_dependency_resolver_resolves_sub_site_package(): file_loader = FileLoader() notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(file_loader, whitelist) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-sub-site-package.py")) + import_resolvers = [ + LocalFileResolver(file_loader), + WhitelistResolver(whitelist), + ] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, lookup) + maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-sub-site-package.py.txt")) assert maybe.graph maybe = maybe.graph.locate_dependency(Path(site_packages_path, "databricks", "labs", "lsql", "core.py")) assert maybe.graph def test_dependency_resolver_raises_problem_with_unfound_root_file(mock_path_lookup, mock_notebook_resolver): - import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - dependency_resolver = DependencyResolver([], mock_notebook_resolver, import_resolver, mock_path_lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path("non-existing.py")) + import_resolvers = [LocalFileResolver(FileLoader())] + dependency_resolver = DependencyResolver([], mock_notebook_resolver, import_resolvers, 
mock_path_lookup) + maybe = dependency_resolver.build_local_file_dependency_graph(Path("non-existing.py.txt")) assert list(maybe.problems) == [ - DependencyProblem('file-not-found', 'File not found: non-existing.py', Path("non-existing.py")) + DependencyProblem('file-not-found', 'File not found: non-existing.py.txt', Path("non-existing.py.txt")) ] @@ -253,12 +268,12 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So return None file_loader = FailingFileLoader() - import_resolver = ImportFileResolver(file_loader, Whitelist()) - dependency_resolver = DependencyResolver([], mock_notebook_resolver, import_resolver, mock_path_lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-sub-site-package.py")) + import_resolvers = [LocalFileResolver(file_loader)] + dependency_resolver = DependencyResolver([], mock_notebook_resolver, import_resolvers, mock_path_lookup) + maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-sub-site-package.py.txt")) assert list(maybe.problems) == [ DependencyProblem( - 'cannot-load-file', 'Could not load file import-sub-site-package.py', Path('') + 'cannot-load-file', 'Could not load file import-sub-site-package.py.txt', Path('') ) ] @@ -272,15 +287,15 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So notebook_loader = FailingNotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) - maybe = dependency_resolver.build_notebook_dependency_graph(Path("root5.py")) + maybe = dependency_resolver.build_notebook_dependency_graph(Path("root5.py.txt")) assert list(maybe.problems) == [ - DependencyProblem('cannot-load-notebook', 'Could not load notebook root5.py', Path('')) + DependencyProblem('cannot-load-notebook', 'Could not load notebook root5.py.txt', Path('')) ] def test_dependency_resolver_raises_problem_with_missing_file_loader(mock_notebook_resolver, mock_path_lookup): dependency_resolver = DependencyResolver([], mock_notebook_resolver, [], mock_path_lookup) - maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-sub-site-package.py")) + maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-sub-site-package.py.txt")) assert list(maybe.problems) == [ DependencyProblem('missing-file-resolver', 'Missing resolver for local files', Path('')) ] @@ -288,13 +303,8 @@ def test_dependency_resolver_raises_problem_with_missing_file_loader(mock_notebo def test_dependency_resolver_resolves_already_installed_library_dependency(mock_notebook_resolver): path_lookup = PathLookup.from_sys_path(Path.cwd()) - file_loader = FileLoader() - whitelist = Whitelist() dependency_resolver = DependencyResolver( - [PipResolver(file_loader, whitelist)], - mock_notebook_resolver, - ImportFileResolver(file_loader, whitelist), - path_lookup, + [PipResolver(FileLoader(), Whitelist())], mock_notebook_resolver, [], path_lookup ) maybe = dependency_resolver.build_library_dependency_graph(Path("astroid")) library_graph = maybe.graph diff --git a/tests/unit/source_code/test_files.py b/tests/unit/source_code/test_files.py index e5e63ab7e7..2c4aedfdc0 100644 --- a/tests/unit/source_code/test_files.py +++ b/tests/unit/source_code/test_files.py @@ -2,113 +2,81 @@ from pathlib import Path from unittest.mock import Mock, create_autospec -import pytest -from databricks.labs.blueprint.tui import MockPrompts -from databricks.labs.ucx.source_code.graph import 
DependencyResolver, SourceContainer -from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader -from databricks.labs.ucx.source_code.python_libraries import PipResolver -from databricks.labs.ucx.source_code.whitelist import Whitelist - from databricks.sdk.service.workspace import Language from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.files import ( - LocalFileMigrator, - FileLoader, - LocalCodeLinter, - FolderLoader, - ImportFileResolver, - Folder, -) +from databricks.labs.ucx.source_code.files import LocalFileMigrator, LocalFileResolver, FileLoader from databricks.labs.ucx.source_code.languages import Languages from databricks.labs.ucx.source_code.path_lookup import PathLookup -from tests.unit import locate_site_packages, _samples_path -def test_migrator_fix_ignores_unsupported_extensions(): +def test_files_fix_ignores_unsupported_extensions(): languages = Languages(MigrationIndex([])) - migrator = LocalFileMigrator(lambda: languages) + files = LocalFileMigrator(languages) path = Path('unsupported.ext') - assert not migrator.apply(path) + assert not files.apply(path) -def test_migrator_fix_ignores_unsupported_language(): +def test_files_fix_ignores_unsupported_language(): languages = Languages(MigrationIndex([])) - migrator = LocalFileMigrator(lambda: languages) - migrator._extensions[".py"] = None # pylint: disable=protected-access + files = LocalFileMigrator(languages) + files._extensions[".py"] = None # pylint: disable=protected-access path = Path('unsupported.py') - assert not migrator.apply(path) + assert not files.apply(path) -def test_migrator_fix_reads_supported_extensions(migration_index): - languages = Languages(migration_index) - migrator = LocalFileMigrator(lambda: languages) +def test_files_fix_reads_supported_extensions(): + languages = Languages(MigrationIndex([])) + files = LocalFileMigrator(languages) path = Path(__file__) - assert not migrator.apply(path) + assert not files.apply(path) -def test_migrator_supported_language_no_diagnostics(): +def test_files_supported_language_no_diagnostics(): languages = create_autospec(Languages) languages.linter(Language.PYTHON).lint.return_value = [] - migrator = LocalFileMigrator(lambda: languages) + files = LocalFileMigrator(languages) path = Path(__file__) - migrator.apply(path) + files.apply(path) languages.fixer.assert_not_called() -def test_migrator_supported_language_no_fixer(): +def test_files_supported_language_no_fixer(): languages = create_autospec(Languages) languages.linter(Language.PYTHON).lint.return_value = [Mock(code='some-code')] languages.fixer.return_value = None - migrator = LocalFileMigrator(lambda: languages) + files = LocalFileMigrator(languages) path = Path(__file__) - migrator.apply(path) + files.apply(path) languages.fixer.assert_called_once_with(Language.PYTHON, 'some-code') -def test_migrator_supported_language_with_fixer(): +def test_files_supported_language_with_fixer(): languages = create_autospec(Languages) languages.linter(Language.PYTHON).lint.return_value = [Mock(code='some-code')] languages.fixer(Language.PYTHON, 'some-code').apply.return_value = "Hi there!" - migrator = LocalFileMigrator(lambda: languages) + files = LocalFileMigrator(languages) with tempfile.NamedTemporaryFile(mode="w+t", suffix=".py") as file: file.writelines(["import tempfile"]) path = Path(file.name) - migrator.apply(path) + files.apply(path) assert file.readline() == "Hi there!" 
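# Hedged aside (not part of the patch): the hunks above switch these tests from
# LocalFileMigrator(lambda: languages) to LocalFileMigrator(languages), so the
# migrator is now built from a Languages instance directly rather than a factory
# callable. A minimal sketch of that pattern, assuming only the constructors and
# calls that already appear in this file's hunks; `sample_path` is a hypothetical
# stand-in, not a fixture from the repository.
from pathlib import Path

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.files import LocalFileMigrator
from databricks.labs.ucx.source_code.languages import Languages

languages = Languages(MigrationIndex([]))  # empty migration index, as in these tests
files = LocalFileMigrator(languages)       # direct instance, no lambda wrapper
sample_path = Path("unsupported.ext")      # unsupported extension, so apply() fixes nothing
assert not files.apply(sample_path)        # mirrors test_files_fix_ignores_unsupported_extensions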
-def test_migrator_walks_directory(): +def test_files_walks_directory(): languages = create_autospec(Languages) languages.linter(Language.PYTHON).lint.return_value = [Mock(code='some-code')] languages.fixer.return_value = None - migrator = LocalFileMigrator(lambda: languages) + files = LocalFileMigrator(languages) path = Path(__file__).parent - migrator.apply(path) + files.apply(path) languages.fixer.assert_called_with(Language.PYTHON, 'some-code') assert languages.fixer.call_count > 1 -def test_linter_walks_directory(mock_path_lookup, migration_index): - mock_path_lookup.append_path(_samples_path(SourceContainer)) - file_loader = FileLoader() - folder_loader = FolderLoader(file_loader) - whitelist = Whitelist() - resolver = DependencyResolver( - [PipResolver(file_loader, whitelist)], - NotebookResolver(NotebookLoader()), - ImportFileResolver(file_loader, whitelist), - mock_path_lookup, - ) - path = Path(Path(__file__).parent, "samples", "simulate-sys-path") - prompts = MockPrompts({"Which file or directory do you want to lint ?": path.as_posix()}) - linter = LocalCodeLinter(file_loader, folder_loader, mock_path_lookup, resolver, lambda: Languages(migration_index)) - advices = linter.lint(prompts, None) - assert not advices - - def test_triple_dot_import(): - file_resolver = ImportFileResolver(FileLoader(), Whitelist()) + file_loader = FileLoader() + file_resolver = LocalFileResolver(file_loader) path_lookup = create_autospec(PathLookup) path_lookup.cwd.as_posix.return_value = '/some/path/to/folder' path_lookup.resolve.return_value = Path('/some/path/foo.py') @@ -120,7 +88,8 @@ def test_triple_dot_import(): def test_single_dot_import(): - file_resolver = ImportFileResolver(FileLoader(), Whitelist()) + file_loader = FileLoader() + file_resolver = LocalFileResolver(file_loader) path_lookup = create_autospec(PathLookup) path_lookup.cwd.as_posix.return_value = '/some/path/to/folder' path_lookup.resolve.return_value = Path('/some/path/to/folder/foo.py') @@ -129,28 +98,3 @@ def test_single_dot_import(): assert not maybe.problems assert maybe.dependency.path == Path('/some/path/to/folder/foo.py') path_lookup.resolve.assert_called_once_with(Path('/some/path/to/folder/foo.py')) - - -def test_folder_has_repr(): - file_loader = FileLoader() - folder = Folder(Path("test"), file_loader, FolderLoader(file_loader)) - assert len(repr(folder)) > 0 - - -site_packages = locate_site_packages() - - -@pytest.mark.skip("Manual testing for troubleshooting") -@pytest.mark.parametrize("path", [(Path(site_packages, "mypy", "build.py"))]) -def test_known_issues(path: Path, migration_index): - file_loader = FileLoader() - folder_loader = FolderLoader(file_loader) - path_lookup = PathLookup.from_sys_path(Path.cwd()) - whitelist = Whitelist() - notebook_resolver = NotebookResolver(NotebookLoader()) - import_resolver = ImportFileResolver(file_loader, whitelist) - resolver = DependencyResolver( - [PipResolver(file_loader, whitelist)], notebook_resolver, import_resolver, path_lookup - ) - linter = LocalCodeLinter(file_loader, folder_loader, path_lookup, resolver, lambda: Languages(migration_index)) - linter.lint(MockPrompts({}), path) diff --git a/tests/unit/source_code/test_graph.py b/tests/unit/source_code/test_graph.py index e5151535c1..8a502698a3 100644 --- a/tests/unit/source_code/test_graph.py +++ b/tests/unit/source_code/test_graph.py @@ -1,20 +1,18 @@ from pathlib import Path -from databricks.labs.ucx.source_code.files import FileLoader, ImportFileResolver, FolderLoader +from databricks.labs.ucx.source_code.files 
import FileLoader from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, StubLibraryResolver from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader from databricks.labs.ucx.source_code.python_libraries import PipResolver -from databricks.labs.ucx.source_code.whitelist import Whitelist +from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist def test_dependency_graph_registers_library(mock_path_lookup): dependency = Dependency(FileLoader(), Path("test")) - file_loader = FileLoader() - whitelist = Whitelist() dependency_resolver = DependencyResolver( - [PipResolver(file_loader, whitelist)], + [PipResolver(FileLoader(), Whitelist())], NotebookResolver(NotebookLoader()), - ImportFileResolver(file_loader, whitelist), + [WhitelistResolver(Whitelist())], mock_path_lookup, ) graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup) @@ -25,23 +23,6 @@ def test_dependency_graph_registers_library(mock_path_lookup): assert graph.path_lookup.resolve(Path("pkgdir")).exists() -def test_folder_loads_content(mock_path_lookup): - path = Path(Path(__file__).parent, "samples") - file_loader = FileLoader() - whitelist = Whitelist() - dependency_resolver = DependencyResolver( - [PipResolver(file_loader, whitelist)], - NotebookResolver(NotebookLoader()), - ImportFileResolver(file_loader, whitelist), - mock_path_lookup, - ) - dependency = Dependency(FolderLoader(file_loader), path) - graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup) - container = dependency.load(mock_path_lookup) - container.build_dependency_graph(graph) - assert len(graph.all_paths) > 1 - - def test_stub_import_resolver_fails_with_library_not_found_dependency_problem(mock_path_lookup): resolver = StubLibraryResolver() maybe = resolver.resolve_library(mock_path_lookup, Path("test")) diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index c33f93517a..9ecbe52053 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -1,5 +1,3 @@ -import logging -import io from pathlib import Path from unittest.mock import create_autospec @@ -8,40 +6,19 @@ from databricks.labs.ucx.source_code.whitelist import Whitelist from databricks.sdk import WorkspaceClient from databricks.sdk.service import compute, jobs -from databricks.sdk.service.workspace import ExportFormat -from databricks.labs.ucx.source_code.files import FileLoader, ImportFileResolver +from databricks.labs.ucx.source_code.files import FileLoader from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver -from databricks.labs.ucx.source_code.jobs import JobProblem, WorkflowLinter, WorkflowTaskContainer +from databricks.labs.ucx.source_code.jobs import WorkflowTaskContainer from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader -def test_job_problem_as_message(): - expected_message = "UNKNOWN:-1 [library-not-found] Library not found: lib" - - problem = JobProblem( - 1234, "test-job", "test-task", "UNKNOWN", "library-not-found", "Library not found: lib", -1, -1, -1, -1 - ) - - assert problem.as_message() == expected_message - - @pytest.fixture -def dependency_resolver(mock_path_lookup) -> DependencyResolver: - file_loader = FileLoader() - whitelist = Whitelist() - resolver = DependencyResolver( - [PipResolver(file_loader, whitelist)], - NotebookResolver(NotebookLoader()), - 
ImportFileResolver(file_loader, whitelist), - mock_path_lookup, - ) - return resolver - - -@pytest.fixture -def graph(mock_path_lookup, dependency_resolver) -> DependencyGraph: +def graph(mock_path_lookup) -> DependencyGraph: dependency = Dependency(FileLoader(), Path("test")) + dependency_resolver = DependencyResolver( + [PipResolver(FileLoader(), Whitelist())], NotebookResolver(NotebookLoader()), [], mock_path_lookup + ) dependency_graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup) return dependency_graph @@ -49,13 +26,13 @@ def graph(mock_path_lookup, dependency_resolver) -> DependencyGraph: def test_workflow_task_container_builds_dependency_graph_not_yet_implemented(mock_path_lookup, graph): # Goal of test is to raise test coverage, remove after implementing ws = create_autospec(WorkspaceClient) - library = compute.Library(jar="library.jar", egg="library.egg", whl="library.whl") + library = compute.Library(jar="library.jar", egg="library.egg", whl="library.whl", requirements="requirements.txt") task = jobs.Task(task_key="test", libraries=[library], existing_cluster_id="id") workflow_task_container = WorkflowTaskContainer(ws, task) problems = workflow_task_container.build_dependency_graph(graph) - assert len(problems) == 4 + assert len(problems) == 5 assert all(problem.code == "not-yet-implemented" for problem in problems) ws.assert_not_called() @@ -97,79 +74,3 @@ def test_workflow_task_container_builds_dependency_graph_unknown_pypi_library(mo assert problems[0].message.startswith("Failed to install unknown-library-name") assert mock_path_lookup.resolve(Path("unknown-library-name")) is None ws.assert_not_called() - - -def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_lookup, empty_index, caplog): - expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] Failed to install unknown-library" - - ws = create_autospec(WorkspaceClient) - linter = WorkflowLinter(ws, dependency_resolver, mock_path_lookup, empty_index) - - libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package="unknown-library-name"))] - task = jobs.Task(task_key="test-task", libraries=libraries) - settings = jobs.JobSettings(name="test-job", tasks=[task]) - job = jobs.Job(job_id=1234, settings=settings) - - ws.jobs.get.return_value = job - - with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): - linter.lint_job(1234) - - assert any(message.startswith(expected_message) for message in caplog.messages) - - -def test_workflow_task_container_builds_dependency_graph_for_requirements_txt(mock_path_lookup, graph): - ws = create_autospec(WorkspaceClient) - ws.workspace.download.return_value = io.BytesIO(b"test") - - libraries = [compute.Library(requirements="requirements.txt")] - task = jobs.Task(task_key="test", libraries=libraries) - - workflow_task_container = WorkflowTaskContainer(ws, task) - problems = workflow_task_container.build_dependency_graph(graph) - - assert len(problems) == 1 - assert problems[0].code == "library-install-failed" - assert problems[0].message.startswith("Failed to install") - assert mock_path_lookup.resolve(Path("test")) is None - ws.assert_not_called() - - -def test_workflow_task_container_build_dependency_graph_warns_about_reference_to_other_requirements( - mock_path_lookup, graph, caplog -): - expected_message = "References to other requirements file is not supported: -r other-requirements.txt" - - ws = create_autospec(WorkspaceClient) - ws.workspace.download.return_value = 
io.BytesIO(b"-r other-requirements.txt") - - libraries = [compute.Library(requirements="requirements.txt")] - task = jobs.Task(task_key="test", libraries=libraries) - - workflow_task_container = WorkflowTaskContainer(ws, task) - - with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): - workflow_task_container.build_dependency_graph(graph) - - assert expected_message in caplog.messages - ws.workspace.download.assert_called_once_with("requirements.txt", format=ExportFormat.AUTO) - - -def test_workflow_task_container_build_dependency_graph_warns_about_reference_to_constrains( - mock_path_lookup, graph, caplog -): - expected_message = "References to constrains file is not supported: -c constrains.txt" - - ws = create_autospec(WorkspaceClient) - ws.workspace.download.return_value = io.BytesIO(b"-c constrains.txt") - - libraries = [compute.Library(requirements="requirements.txt")] - task = jobs.Task(task_key="test", libraries=libraries) - - workflow_task_container = WorkflowTaskContainer(ws, task) - - with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.jobs"): - workflow_task_container.build_dependency_graph(graph) - - assert expected_message in caplog.messages - ws.workspace.download.assert_called_once_with("requirements.txt", format=ExportFormat.AUTO) diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index f3647574f7..d4ecebc99f 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -18,42 +18,42 @@ # the following samples are real samples from https://github.com/databricks-industry-solutions # please keep them untouched, we want our unit tests to run against genuinely representative data PYTHON_NOTEBOOK_SAMPLE = ( - "00_var_context.py", + "00_var_context.py.txt", Language.PYTHON, ['md', 'md', 'md', 'python', 'python', 'python', 'md', 'python', 'md'], ) PYTHON_NOTEBOOK_WITH_RUN_SAMPLE = ( - "01_var_market_etl.py", + "01_var_market_etl.py.txt", Language.PYTHON, ['md', 'run', 'md', 'python', 'md', 'python', 'python', 'python', 'python', 'md', 'python', 'python', 'md', 'python', 'python', 'python', 'md', 'python', 'md', 'python', 'python', 'md', 'python'], ) SCALA_NOTEBOOK_SAMPLE = ( - "01_HL7Streaming.scala", + "01_HL7Streaming.scala.txt", Language.SCALA, ['md', 'md', 'scala', 'sql', 'md', 'scala', 'scala', 'md', 'md', 'scala', 'md', 'scala', 'sql', 'sql', 'md', 'scala', 'md', 'scala', 'sql', 'sql', 'sql', 'sql', 'sql', 'sql', 'sql', 'md', 'scala', 'md', 'md'], ) R_NOTEBOOK_SAMPLE = ( - "3_SparkR_Fine Grained Demand Forecasting.r", + "3_SparkR_Fine Grained Demand Forecasting.r.txt", Language.R, ['md', 'md', 'md', 'r', 'r', 'md', 'run', 'r', 'md', 'sql', 'md', 'sql', 'md', 'sql', 'md', 'md', 'r', 'md', 'r', 'md', 'r', 'md', 'r', 'md', 'r', 'md', 'md', 'r', 'md', 'md', 'r', 'md', 'r', 'md', 'r', 'md', 'md', 'r', 'md', 'r', 'md', 'r', 'md', 'r', 'md', 'sql', 'md', 'sql', 'md'], ) SQL_NOTEBOOK_SAMPLE = ( - "chf-pqi-scoring.sql", + "chf-pqi-scoring.sql.txt", Language.SQL, ['md', 'sql', 'sql', 'md', 'sql', 'python', 'sql', 'sql', 'sql', 'md', 'sql', 'sql', 'md', 'sql', 'sql', 'md', 'sql'], ) SHELL_NOTEBOOK_SAMPLE = ( - "notebook-with-shell-cell.py", + "notebook-with-shell-cell.py.txt", Language.PYTHON, ['python', 'sh'], ) PIP_NOTEBOOK_SAMPLE = ( - "notebook-with-pip-cell.py", + "notebook-with-pip-cell.py.txt", Language.PYTHON, ['python', 'pip'], ) @@ -129,12 +129,12 @@ def test_notebook_builds_leaf_dependency_graph(mock_path_lookup): notebook_loader = 
NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) - maybe = dependency_resolver.resolve_notebook(mock_path_lookup, Path("leaf1.py")) + maybe = dependency_resolver.resolve_notebook(mock_path_lookup, Path("leaf1.py.txt")) graph = DependencyGraph(maybe.dependency, None, dependency_resolver, mock_path_lookup) container = maybe.dependency.load(mock_path_lookup) problems = container.build_dependency_graph(graph) assert not problems - assert graph.all_paths == {mock_path_lookup.cwd / "leaf1.py"} + assert graph.all_paths == {mock_path_lookup.cwd / "leaf1.py.txt"} def get_status_side_effect(*args): @@ -143,7 +143,7 @@ def get_status_side_effect(*args): def test_notebook_builds_depth1_dependency_graph(mock_path_lookup): - paths = ["root1.run.py", "leaf1.py", "leaf2.py"] + paths = ["root1.run.py.txt", "leaf1.py.txt", "leaf2.py.txt"] notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) @@ -156,7 +156,7 @@ def test_notebook_builds_depth1_dependency_graph(mock_path_lookup): def test_notebook_builds_depth2_dependency_graph(mock_path_lookup): - paths = ["root2.run.py", "root1.run.py", "leaf1.py", "leaf2.py"] + paths = ["root2.run.py.txt", "root1.run.py.txt", "leaf1.py.txt", "leaf2.py.txt"] notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) @@ -169,7 +169,7 @@ def test_notebook_builds_depth2_dependency_graph(mock_path_lookup): def test_notebook_builds_dependency_graph_avoiding_duplicates(mock_path_lookup): - paths = ["root3.run.py", "root1.run.py", "leaf1.py", "leaf2.py"] + paths = ["root3.run.py.txt", "root1.run.py.txt", "leaf1.py.txt", "leaf2.py.txt"] notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) @@ -183,7 +183,7 @@ def test_notebook_builds_dependency_graph_avoiding_duplicates(mock_path_lookup): def test_notebook_builds_cyclical_dependency_graph(mock_path_lookup): - paths = ["cyclical1.run.py", "cyclical2.run.py"] + paths = ["cyclical1.run.py.txt", "cyclical2.run.py.txt"] notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) @@ -196,7 +196,7 @@ def test_notebook_builds_cyclical_dependency_graph(mock_path_lookup): def test_notebook_builds_python_dependency_graph(mock_path_lookup): - paths = ["root4.py", "leaf3.py"] + paths = ["root4.py.txt", "leaf3.py.txt"] notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) dependency_resolver = DependencyResolver([], notebook_resolver, [], mock_path_lookup) @@ -209,7 +209,7 @@ def test_notebook_builds_python_dependency_graph(mock_path_lookup): def test_detects_manual_migration_in_dbutils_notebook_run_in_python_code_(): - sources: list[str] = _load_sources(SourceContainer, "run_notebooks.py") + sources: list[str] = _load_sources(SourceContainer, "run_notebooks.py.txt") linter = PythonLinter() advices = list(linter.lint(sources[0])) assert [ @@ -225,7 +225,7 @@ def test_detects_manual_migration_in_dbutils_notebook_run_in_python_code_(): def test_detects_automatic_migration_in_dbutils_notebook_run_in_python_code_(): - sources: list[str] = 
_load_sources(SourceContainer, "root4.py") + sources: list[str] = _load_sources(SourceContainer, "root4.py.txt") linter = PythonLinter() advices = list(linter.lint(sources[0])) assert [ @@ -235,7 +235,7 @@ def test_detects_automatic_migration_in_dbutils_notebook_run_in_python_code_(): start_line=2, start_col=0, end_line=2, - end_col=34, + end_col=38, ) ] == advices diff --git a/tests/unit/source_code/test_path_lookup_simulation.py b/tests/unit/source_code/test_path_lookup_simulation.py index f5959fbc40..1e97bbc2ac 100644 --- a/tests/unit/source_code/test_path_lookup_simulation.py +++ b/tests/unit/source_code/test_path_lookup_simulation.py @@ -2,11 +2,11 @@ from tempfile import TemporaryDirectory import pytest -from databricks.labs.ucx.source_code.files import ImportFileResolver, FileLoader +from databricks.labs.ucx.source_code.files import LocalFileResolver, FileLoader from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyResolver from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader -from databricks.labs.ucx.source_code.whitelist import Whitelist +from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist from tests.unit import ( _samples_path, ) @@ -41,8 +41,11 @@ def test_locates_notebooks(source: list[str], expected: int, mock_path_lookup): file_loader = FileLoader() notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(file_loader, Whitelist()) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, mock_path_lookup) + import_resolvers = [ + WhitelistResolver(Whitelist()), + LocalFileResolver(file_loader), + ] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(notebook_path) assert not maybe.problems assert maybe.graph is not None @@ -66,8 +69,11 @@ def test_locates_files(source: list[str], expected: int): file_loader = FileLoader() notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(file_loader, whitelist) - resolver = DependencyResolver([], notebook_resolver, import_resolver, lookup) + import_resolvers = [ + WhitelistResolver(whitelist), + LocalFileResolver(file_loader), + ] + resolver = DependencyResolver([], notebook_resolver, import_resolvers, lookup) maybe = resolver.build_local_file_dependency_graph(file_path) assert not maybe.problems assert maybe.graph is not None @@ -99,11 +105,16 @@ def test_locates_notebooks_with_absolute_path(): """, "utf-8", ) + whitelist = Whitelist() lookup = PathLookup.from_sys_path(Path.cwd()) + file_loader = FileLoader() notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - resolver = DependencyResolver([], notebook_resolver, import_resolver, lookup) + import_resolvers = [ + WhitelistResolver(whitelist), + LocalFileResolver(file_loader), + ] + resolver = DependencyResolver([], notebook_resolver, import_resolvers, lookup) maybe = resolver.build_notebook_dependency_graph(parent_file_path) assert not maybe.problems assert maybe.graph is not None @@ -135,11 +146,16 @@ def func(): """, "utf-8", ) + whitelist = Whitelist() lookup = PathLookup.from_sys_path(Path.cwd()) + file_loader = FileLoader() notebook_loader = 
NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - resolver = DependencyResolver([], notebook_resolver, import_resolver, lookup) + import_resolvers = [ + WhitelistResolver(whitelist), + LocalFileResolver(file_loader), + ] + resolver = DependencyResolver([], notebook_resolver, import_resolvers, lookup) maybe = resolver.build_notebook_dependency_graph(parent_file_path) assert not maybe.problems assert maybe.graph is not None diff --git a/tests/unit/source_code/test_python_libraries.py b/tests/unit/source_code/test_python_libraries.py index d85b84c04b..912f8d3fe3 100644 --- a/tests/unit/source_code/test_python_libraries.py +++ b/tests/unit/source_code/test_python_libraries.py @@ -1,8 +1,6 @@ from pathlib import Path -from unittest.mock import create_autospec from databricks.labs.ucx.source_code.files import FileLoader -from databricks.labs.ucx.source_code.graph import DependencyProblem from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.python_libraries import DistInfoPackage, PipResolver from databricks.labs.ucx.source_code.whitelist import Whitelist @@ -27,18 +25,6 @@ def test_pip_resolver_does_not_resolve_unknown_library(mock_path_lookup): assert mock_path_lookup.resolve(Path("unknown-library-name")) is None -def test_pip_resolver_locates_dist_info_without_parent(): - mock_path_lookup = create_autospec(PathLookup) - mock_path_lookup.resolve.return_value = Path("/non/existing/path/") - - pip_resolver = PipResolver(FileLoader(), Whitelist()) - maybe = pip_resolver.resolve_library(mock_path_lookup, Path("library")) - - assert len(maybe.problems) == 1 - assert maybe.problems[0] == DependencyProblem("no-dist-info", "No dist-info found for library") - mock_path_lookup.resolve.assert_called_once() - - def test_dist_info_package_parses_installed_package_with_toplevel(): site_packages_path = locate_site_packages() astroid_path = Path(site_packages_path, "astroid-3.1.0.dist-info") diff --git a/tests/unit/source_code/test_python_linter.py b/tests/unit/source_code/test_python_linter.py index 83a6918bad..b1694e85c9 100644 --- a/tests/unit/source_code/test_python_linter.py +++ b/tests/unit/source_code/test_python_linter.py @@ -2,7 +2,6 @@ import ast import pytest -from databricks.labs.ucx.source_code.graph import DependencyProblem from databricks.labs.ucx.source_code.python_linter import ASTLinter, PythonLinter @@ -25,27 +24,27 @@ def test_linter_returns_list_of_dbutils_notebook_run_calls(): def test_linter_returns_empty_list_of_imports(): linter = ASTLinter.parse('') - assert [] == PythonLinter.list_import_sources(linter, DependencyProblem)[0] + assert [] == PythonLinter.list_import_sources(linter) def test_linter_returns_import(): linter = ASTLinter.parse('import x') - assert ["x"] == [node.name for node in PythonLinter.list_import_sources(linter, DependencyProblem)[0]] + assert ["x"] == [node.name for node in PythonLinter.list_import_sources(linter)] def test_linter_returns_import_from(): linter = ASTLinter.parse('from x import z') - assert ["x"] == [node.name for node in PythonLinter.list_import_sources(linter, DependencyProblem)[0]] + assert ["x"] == [node.name for node in PythonLinter.list_import_sources(linter)] def test_linter_returns_import_module(): linter = ASTLinter.parse('importlib.import_module("x")') - assert ["x"] == [node.name for node in PythonLinter.list_import_sources(linter, DependencyProblem)[0]] + assert ["x"] == [node.name for node in 
PythonLinter.list_import_sources(linter)] def test_linter_returns__import__(): linter = ASTLinter.parse('importlib.__import__("x")') - assert ["x"] == [node.name for node in PythonLinter.list_import_sources(linter, DependencyProblem)[0]] + assert ["x"] == [node.name for node in PythonLinter.list_import_sources(linter)] def test_linter_returns_appended_absolute_paths(): diff --git a/tests/unit/source_code/test_s3fs.py b/tests/unit/source_code/test_s3fs.py index 93b80918ef..b6c86f4cd0 100644 --- a/tests/unit/source_code/test_s3fs.py +++ b/tests/unit/source_code/test_s3fs.py @@ -6,9 +6,9 @@ DependencyResolver, DependencyProblem, ) -from databricks.labs.ucx.source_code.files import FileLoader, ImportFileResolver +from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver -from databricks.labs.ucx.source_code.whitelist import Whitelist +from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist S3FS_DEPRECATION_MESSAGE = "Use of dependency s3fs is deprecated" @@ -23,7 +23,7 @@ DependencyProblem( code='dependency-check', message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), + source_path=Path('path.py.txt'), start_line=1, start_col=0, end_line=1, @@ -37,7 +37,7 @@ DependencyProblem( code='dependency-check', message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), + source_path=Path('path.py.txt'), start_line=1, start_col=0, end_line=1, @@ -53,7 +53,7 @@ DependencyProblem( code='dependency-check', message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), + source_path=Path('path.py.txt'), start_line=1, start_col=0, end_line=1, @@ -68,7 +68,7 @@ DependencyProblem( code='dependency-check', message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), + source_path=Path('path.py.txt'), start_line=2, start_col=4, end_line=2, @@ -82,7 +82,7 @@ DependencyProblem( code='dependency-check', message=S3FS_DEPRECATION_MESSAGE, - source_path=Path('path.py'), + source_path=Path('path.py.txt'), start_line=1, start_col=0, end_line=1, @@ -96,7 +96,7 @@ DependencyProblem( code='dependency-check', message='Use of dependency s3fs.subpackage is deprecated', - source_path=Path('path.py'), + source_path=Path('path.py.txt'), start_line=1, start_col=0, end_line=1, @@ -116,8 +116,8 @@ def test_detect_s3fs_import(empty_index, source: str, expected: list[DependencyP notebook_loader = NotebookLoader() file_loader = FileLoader() notebook_resolver = NotebookResolver(notebook_loader) - import_resolver = ImportFileResolver(file_loader, whitelist) - dependency_resolver = DependencyResolver([], notebook_resolver, import_resolver, mock_path_lookup) + import_resolvers = [LocalFileResolver(file_loader), WhitelistResolver(whitelist)] + dependency_resolver = DependencyResolver([], notebook_resolver, import_resolvers, mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(sample) assert maybe.problems == [_.replace(source_path=sample) for _ in expected] @@ -129,7 +129,7 @@ def test_detect_s3fs_import(empty_index, source: str, expected: list[DependencyP DependencyProblem( code='dependency-check', message='Use of dependency s3fs is deprecated', - source_path=Path('leaf9.py'), + source_path=Path('leaf9.py.txt'), start_line=1, start_col=0, end_line=1, @@ -144,8 +144,8 @@ def test_detect_s3fs_import_in_dependencies( yml = mock_path_lookup.cwd / "s3fs-python-compatibility-catalog.yml" file_loader = FileLoader() whitelist = 
Whitelist.parse(yml.read_text(), use_defaults=True) - import_resolver = ImportFileResolver(file_loader, whitelist) - dependency_resolver = DependencyResolver([], mock_notebook_resolver, import_resolver, mock_path_lookup) - sample = mock_path_lookup.cwd / "root9.py" + import_resolvers = [LocalFileResolver(file_loader), WhitelistResolver(whitelist)] + dependency_resolver = DependencyResolver([], mock_notebook_resolver, import_resolvers, mock_path_lookup) + sample = mock_path_lookup.cwd / "root9.py.txt" maybe = dependency_resolver.build_local_file_dependency_graph(sample) assert maybe.problems == expected diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index d8c384fa94..d4e9526d28 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -388,23 +388,7 @@ def test_migrate_locations_azure(ws): def test_migrate_locations_aws(ws, caplog): - successful_return = """ - { - "UserId": "uu@mail.com", - "Account": "1234", - "Arn": "arn:aws:sts::1234:assumed-role/AWSVIEW/uu@mail.com" - } - """ - - def successful_call(_): - return 0, successful_return, "" - - ctx = WorkspaceContext(ws).replace( - is_aws=True, - is_azure=False, - aws_profile="profile", - aws_cli_run_command=successful_call, - ) + ctx = WorkspaceContext(ws).replace(is_aws=True, is_azure=False, aws_profile="profile") migrate_locations(ws, ctx=ctx) ws.external_locations.list.assert_called() @@ -558,10 +542,3 @@ def test_migrate_dbsql_dashboards(ws, caplog): def test_revert_dbsql_dashboards(ws, caplog): revert_dbsql_dashboards(ws) ws.dashboards.list.assert_called_once() - - -def test_cli_missing_awscli(ws, mocker, caplog): - mocker.patch("shutil.which", side_effect=ValueError("Couldn't find AWS CLI in path")) - with pytest.raises(ValueError): - ctx = WorkspaceContext(ws).replace(is_aws=True, is_azure=False, aws_profile="profile") - migrate_locations(ws, ctx) diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py index 2ff8e97839..b430e94a10 100644 --- a/tests/unit/test_install.py +++ b/tests/unit/test_install.py @@ -438,7 +438,6 @@ def test_save_config(ws, mock_installation): r"Choose how to map the workspace groups.*": "2", r".*": "", r".*days to analyze submitted runs.*": "1", - r"Reconciliation threshold, in percentage.*": "5", } ) install = WorkspaceInstaller(ws).replace( @@ -463,7 +462,6 @@ def test_save_config(ws, mock_installation): 'renamed_group_prefix': 'db-temp-', 'warehouse_id': 'abc', 'workspace_start_path': '/', - 'recon_tolerance_percent': 5, }, ) @@ -497,7 +495,6 @@ def test_save_config_strip_group_names(ws, mock_installation): r"Choose how to map the workspace groups.*": "2", # specify names r".*workspace group names.*": "g1, g2, g99", r".*": "", - r"Reconciliation threshold, in percentage.*": "5", } ) ws.workspace.get_status = not_found @@ -525,7 +522,6 @@ def test_save_config_strip_group_names(ws, mock_installation): 'renamed_group_prefix': 'db-temp-', 'warehouse_id': 'abc', 'workspace_start_path': '/', - 'recon_tolerance_percent': 5, }, ) @@ -546,7 +542,6 @@ def test_create_cluster_policy(ws, mock_installation): r".*workspace group names.*": "g1, g2, g99", r".*We have identified one or more cluster.*": "No", r".*Choose a cluster policy.*": "0", - r"Reconciliation threshold, in percentage.*": "5", r".*": "", } ) @@ -573,7 +568,6 @@ def test_create_cluster_policy(ws, mock_installation): 'renamed_group_prefix': 'db-temp-', 'warehouse_id': 'abc', 'workspace_start_path': '/', - 'recon_tolerance_percent': 5, }, ) @@ -1200,7 +1194,6 @@ def test_save_config_should_include_databases(ws, 
mock_installation): r".*PRO or SERVERLESS SQL warehouse.*": "1", r"Choose how to map the workspace groups.*": "2", # specify names r"Comma-separated list of databases to migrate.*": "db1,db2", - r"Reconciliation threshold, in percentage.*": "5", r".*": "", } ) @@ -1228,7 +1221,6 @@ def test_save_config_should_include_databases(ws, mock_installation): 'warehouse_id': 'abc', 'workspace_start_path': '/', 'num_days_submit_runs_history': 30, - 'recon_tolerance_percent': 5, }, ) @@ -1364,7 +1356,6 @@ def test_fresh_install(ws, mock_installation): r"Parallelism for migrating.*": "1000", r"Min workers for auto-scale.*": "2", r"Max workers for auto-scale.*": "20", - r"Reconciliation threshold, in percentage.*": "5", r".*": "", } ) @@ -1393,7 +1384,6 @@ def test_fresh_install(ws, mock_installation): 'renamed_group_prefix': 'db-temp-', 'warehouse_id': 'abc', 'workspace_start_path': '/', - 'recon_tolerance_percent': 5, }, ) @@ -1867,7 +1857,6 @@ def test_save_config_ext_hms(ws, mock_installation): r".*PRO or SERVERLESS SQL warehouse.*": "1", r"Choose how to map the workspace groups.*": "2", # specify names r"Comma-separated list of databases to migrate.*": "db1,db2", - r"Reconciliation threshold, in percentage.*": "5", r".*": "", } ) @@ -1895,7 +1884,6 @@ def test_save_config_ext_hms(ws, mock_installation): 'warehouse_id': 'abc', 'workspace_start_path': '/', 'num_days_submit_runs_history': 30, - 'recon_tolerance_percent': 5, }, )
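Taken together, the test updates above restore DependencyResolver's list-of-import-resolvers signature: where the tests previously built a single ImportFileResolver(file_loader, whitelist), they now pass a list such as [WhitelistResolver(Whitelist()), LocalFileResolver(FileLoader())]. The sketch below is a hedged recombination of constructors and calls that appear verbatim in the hunks above, not an authoritative usage example; the sample path is one of the renamed *.py.txt fixtures referenced by the tests, and the resolver-role comments are assumptions.

from pathlib import Path

from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver
from databricks.labs.ucx.source_code.graph import DependencyResolver
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.python_libraries import PipResolver
from databricks.labs.ucx.source_code.whitelist import Whitelist, WhitelistResolver

lookup = PathLookup.from_sys_path(Path.cwd())
notebook_resolver = NotebookResolver(NotebookLoader())
import_resolvers = [
    WhitelistResolver(Whitelist()),   # assumed role: resolves known/allow-listed imports
    LocalFileResolver(FileLoader()),  # assumed role: resolves imports to local files
]
dependency_resolver = DependencyResolver(
    [PipResolver(FileLoader(), Whitelist())],  # library resolvers, as in the graph/jobs tests
    notebook_resolver,
    import_resolvers,
    lookup,
)
maybe = dependency_resolver.build_local_file_dependency_graph(Path("root5.py.txt"))
if not maybe.failed:
    print(maybe.graph.all_relative_names())
else:
    print(list(maybe.problems))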