From 24e37a4ff1827bb6afd4daa020fce17c8568f0f1 Mon Sep 17 00:00:00 2001 From: Justin Boyd Date: Tue, 18 Jul 2023 09:52:44 -0400 Subject: [PATCH] adding Docker packaging to support non-linux/macOS environments --- docker/README.md | 43 ++++++++++++++++++++++++++++++++ docker/databricks/.databrickscfg | 9 +++++++ docker/dockerfile | 21 ++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 docker/README.md create mode 100644 docker/databricks/.databrickscfg create mode 100644 docker/dockerfile diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..b0fe8cc --- /dev/null +++ b/docker/README.md @@ -0,0 +1,43 @@ +# OS agnostic runtime for Databricks Migrate +[Databricks Migrate](https://github.com/databrickslabs/migrate) is a tool from [Databricks Labs](https://github.com/databrickslabs) that facilitates migration of objects in one Databricks workspace to another. The tool requires a Unix/Linux OS environment to operate. The packaging in this repo allows the Migrate tool to be run in other OS environments (e.g. Windows) using Docker. + +## Local Environment Prerequisites +The scripts located in this directory have the following local dependencies: +* [Docker Desktop](https://docs.docker.com/desktop/install/windows-install/) + +## Getting Started +_Note: For the following steps, the legacy workspace that is being migrated `from` is aliased as `oldWS`. The new workspace that is being migrated `to` is aliased as `newWS`._ + +1. Open the `.\databricks\.databrickscfg` file in this repo. There are entries for `oldWS` and `newWS` in this file. Replace the value for each `host` with the appropriate URL for that workspace. The URL will be of the format `https://<workspace-instance-name>` (for example, `https://adb-1234567890123456.7.azuredatabricks.net` for an Azure workspace). +2. For each workspace, [generate an access token](https://docs.databricks.com/dev-tools/auth.html#personal-access-tokens-for-users) for a user that has admin privileges. +3. Copy the generated token into the `.\databricks\.databrickscfg` file in this repo. 
Replace the value for each `token` in the file with the appropriate token for that workspace. +4. Save the changes to `.\databricks\.databrickscfg`. + + +## Environment Setup +With the Docker daemon running, navigate to this directory in your local shell, and execute the following command. +``` +docker build -t databricks-migrate . +``` +Once the image is done building, execute the following command from your shell to start the container. Your shell will be redirected to `/opt/migrate` inside the running container. The tool is now ready for use. +``` +docker run -it --name databricks-migrate -v .\databricks\.databrickscfg:/root/.databrickscfg -v .\databricks:/databricks databricks-migrate +``` + +## Usage +For CDWNG workspaces, the following commands have been used to handle migration of specific Databricks objects. + +Export objects from `oldWS`: +``` +python migration_pipeline.py --azure --profile oldWS --export-pipeline --set-export-dir $SESSIONS_DIR --notebook-format SOURCE --keep-tasks users groups workspace_item_log workspace_acls notebooks clusters instance_pools jobs +``` +Import objects to `newWS`: + +> Note: the `--session` parameter must be set to the session value generated by the export run above. Here it is passed through an environment variable, `$EXPORT_SESSION`. + +``` +python migration_pipeline.py --azure --profile newWS --import-pipeline --set-export-dir $SESSIONS_DIR --notebook-format SOURCE --session $EXPORT_SESSION --keep-tasks users groups workspace_item_log workspace_acls notebooks clusters instance_pools jobs +``` + +## Output and Logging +The `.\databricks` directory in this repo is mounted to the running container. This in combination with the preset $SESSIONS_DIR environment variable (see Usage above) allows the output of the migration pipeline to be available and persisted on the local host. 
\ No newline at end of file
diff --git a/docker/databricks/.databrickscfg b/docker/databricks/.databrickscfg
new file mode 100644
index 0000000..a2ff9f7
--- /dev/null
+++ b/docker/databricks/.databrickscfg
@@ -0,0 +1,9 @@
+[oldWS]
+host = https://
+token = dapi...
+jobs-api-version = 2.0
+
+[newWS]
+host = https://
+token = dapi...
+jobs-api-version = 2.0
\ No newline at end of file
diff --git a/docker/dockerfile b/docker/dockerfile
new file mode 100644
index 0000000..f401ce9
--- /dev/null
+++ b/docker/dockerfile
@@ -0,0 +1,21 @@
+# Interactive image for running Databricks Migrate on hosts without a Unix shell.
+# The minor version is pinned so rebuilds do not drift to a newer Python release.
+FROM python:3.11
+
+# DBX_DIR is where the host's ./databricks directory is bind-mounted at `docker run` time
+ENV DBX_DIR=/databricks
+ENV SESSIONS_DIR=${DBX_DIR}/migrate-sessions
+ENV INSTALL_DIR=/opt
+RUN mkdir -p "${SESSIONS_DIR}"
+
+# install databricks-cli; one layer, and no pip download cache left in the image
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir --upgrade databricks-cli
+
+# shallow-clone the migrate utility and install it from its checkout
+RUN git clone --depth 1 https://github.com/databrickslabs/migrate.git "${INSTALL_DIR}/migrate"
+WORKDIR ${INSTALL_DIR}/migrate
+RUN pip install --no-cache-dir .
+
+# left as root: the documented `docker run` mounts the config at /root/.databrickscfg
+ENTRYPOINT [ "/bin/bash" ]