This repository has been archived by the owner on Nov 23, 2023. It is now read-only.

refactor(python): Use Poetry for package management

In CI, since the packages are now installed in a virtualenv rather than
globally, we have to activate the virtualenv at the start of jobs.

The cattrs package has to be kept below version 1.1.0 until we either
upgrade to Python 3.7 or AWS CDK moves to a dependency which doesn't
require Python 3.7. The issue is linked on the relevant
pyproject.toml line.

This merges the production dependencies of the backend and infra
subdirectories, which is not ideal. Once subproject support
<python-poetry/poetry#2270> arrives we should
pull this apart, maybe keeping only the development and test
dependencies in the root. To mitigate this in the meantime, bundle.bash
pulls out only those dependencies which the Lambda function needs.
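
For illustration, a minimal sketch of what the cattrs pin described above might look like in pyproject.toml. The version constraints and layout here are assumptions, not copied from the real file, and the actual tracking-issue URL only appears in the repository:

```toml
[tool.poetry.dependencies]
python = "^3.6"  # assumed constraint; the project targets Python 3.6
# Keep cattrs below 1.1.0: newer releases require Python >= 3.7.
# (The tracking issue is linked on this line in the real pyproject.toml.)
cattrs = "<1.1.0"
```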
l0b0 committed Nov 9, 2020
1 parent 75be132 commit c88dd6a
Showing 11 changed files with 2,148 additions and 82 deletions.
56 changes: 29 additions & 27 deletions .github/workflows/ci.yml
@@ -22,32 +22,31 @@ jobs:
       - name: Install Python dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -r requirements-dev.txt
-          pip install -r infra/requirements.txt
-          pip install -r backend/endpoints/datasets/requirements.txt
+          python -m pip install poetry
+          python -m poetry install
       - name: Check last commit message
         if: github.event_name == 'push'
         run: |
-          gitlint
+          poetry run gitlint
       - name: Check all commit messages in Pull Request
         if: github.event_name == 'pull_request'
         run: >
-          gitlint --commits
+          poetry run gitlint --commits
           origin/${{ github.base_ref }}..${{ github.event.pull_request.head.sha }}
       - name: Check Python code formatting
         run: |
-          black . --check --diff
+          poetry run black . --check --diff
       - name: Check Python code quality
         run: |
-          pylint backend/ infra/
+          poetry run pylint backend/ infra/
       - name: Check Python code import statements
         run: |
-          isort . --check --diff
+          poetry run isort . --check --diff

   test:
@@ -69,12 +68,12 @@ jobs:
       - name: Install Python dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -r requirements-dev.txt
-          pip install -r infra/requirements.txt
+          python -m pip install poetry
+          python -m poetry install
       - name: Run unit tests
         run: |
-          pytest tests/
+          poetry run pytest tests/

   test-infra:
@@ -96,9 +95,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -r requirements-dev.txt
-          pip install -r infra/requirements.txt
-          pip install -r backend/endpoints/datasets/requirements.txt
+          python -m pip install poetry
+          python -m poetry install
       - name: Use Node.js 12.x for CDK deployment
         uses: actions/setup-node@v2.1.2
@@ -110,7 +108,7 @@ jobs:
         run: npm install -g aws-cdk

       - name: Print CDK version
-        run: cdk --version
+        run: poetry run cdk --version

       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1
@@ -128,21 +126,21 @@ jobs:
       - name: Deploy AWS stack for testing
         run: |
-          cdk bootstrap aws://unknown-account/ap-southeast-2
-          cdk deploy --require-approval never geospatial-data-lake
+          poetry run cdk bootstrap aws://unknown-account/ap-southeast-2
+          poetry run cdk deploy --require-approval never geospatial-data-lake
         working-directory: infra

       - name: Run AWS infra tests
         run: |
-          pytest infra/tests/
+          poetry run pytest infra/tests/
       - name: Run AWS backend tests
         run: |
-          pytest backend/tests/
+          poetry run pytest backend/tests/
       - name: Destroy AWS stack used for testing
         run: |
-          cdk destroy --force geospatial-data-lake
+          poetry run cdk destroy --force geospatial-data-lake
         working-directory: infra


@@ -167,8 +165,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -r requirements-dev.txt
-          pip install -r infra/requirements.txt
+          python -m pip install poetry
+          python -m poetry install --no-dev
       - name: Use Node.js 12.x for CDK deployment
         uses: actions/setup-node@v2.1.2
@@ -180,7 +178,7 @@ jobs:
         run: npm install -g aws-cdk

       - name: Print CDK version
-        run: cdk --version
+        run: poetry run cdk --version

       # NONPROD DEPLOYMENT
       - name: (NonProd) Configure AWS credentials
@@ -199,9 +197,11 @@ jobs:
         if: >
           github.ref == 'refs/heads/master'
           && github.repository == 'linz/geospatial-data-lake'
+        env:
+          DEPLOY_ENV: nonprod
         run: |
-          cdk bootstrap aws://unknown-account/ap-southeast-2
-          DEPLOY_ENV=nonprod cdk deploy --require-approval never geospatial-data-lake
+          poetry run cdk bootstrap aws://unknown-account/ap-southeast-2
+          poetry run cdk deploy --require-approval never geospatial-data-lake
         working-directory: infra

       # PROD DEPLOYMENT
@@ -221,7 +221,9 @@ jobs:
         if: >
           startsWith(github.ref, 'release')
           && github.repository == 'linz/geospatial-data-lake'
+        env:
+          DEPLOY_ENV: prod
         run: |
-          cdk bootstrap aws://unknown-account/ap-southeast-2
-          DEPLOY_ENV=prod cdk deploy --require-approval never geospatial-data-lake
+          poetry run cdk bootstrap aws://unknown-account/ap-southeast-2
+          poetry run cdk deploy --require-approval never geospatial-data-lake
         working-directory: infra
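
The workflow above handles the virtualenv mentioned in the commit message by prefixing every tool invocation with `poetry run`. For comparison, a minimal sketch of the activate-once alternative; this is not what the commit does, and the job scripting around `poetry env info --path` is assumed:

```bash
# Sketch only: activate the Poetry-managed virtualenv once, then call the
# tools directly instead of prefixing each command with `poetry run`.
venv_path="$(poetry env info --path)"
source "${venv_path}/bin/activate"
pytest tests/  # now resolves to the virtualenv's pytest
```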
21 changes: 8 additions & 13 deletions README.md
@@ -14,12 +14,19 @@ $ python3 -m venv .venv
 $ source .venv/bin/activate
 ```

-* Upgrade pip and install the required dependencies
+* Upgrade pip

 ```bash
 $ pip install --upgrade pip
 ```

+* [Install Poetry](https://python-poetry.org/docs/#installation)
+
+* Install the dependencies:
+
+```bash
+$ poetry install
+```

 ## AWS CDK Environment (AWS Infrastructure)
 * Install NVM (use latest version)
@@ -50,12 +57,6 @@ $ npm install -g aws-cdk


 ## AWS Infrastructure Deployment (CDK Stack)
-* Install Python CDK dependencies
-
-```bash
-$ pip install -r infra/requirements.txt
-```
-
 * Get AWS credentials (see: https://www.npmjs.com/package/aws-azure-login)

 ```bash
@@ -72,12 +73,6 @@ $ cdk deploy --profile <geospatial-data-lake-nonprod|geospatial-data-lake-prod>


 ## Development
-* Install Python development dependencies
-
-```bash
-$ pip install -r requirements-dev.txt
-```
-
 * Install commit-msg git hook

 ```bash
21 changes: 21 additions & 0 deletions backend/bundle.bash
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+set -o errexit -o noclobber -o nounset
+
+script_dir="$(dirname "$0")"
+
+python -m venv .venv
+. .venv/bin/activate
+python -m pip install --upgrade pip
+python -m pip install poetry
+
+work_dir="$(mktemp --directory)"
+all_requirements_file="${work_dir}/all-requirements.txt"
+backend_requirements_file="${work_dir}/backend-requirements.txt"
+
+# Get requirements file for entries in requirements.txt
+poetry export --output="$all_requirements_file" --without-hashes
+grep --file="${script_dir}/requirements.txt" "$all_requirements_file" > "$backend_requirements_file"
+
+pip install --requirement="$backend_requirements_file" --target=/asset-output
+cp --archive --update --verbose "${script_dir}/endpoints" /asset-output/
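
For context on the grep step above: backend/requirements.txt (not shown in this diff) acts as a list of package-name patterns, and only the matching lines from the full `poetry export` output end up in the Lambda bundle. A sketch with hypothetical package names and versions, purely to illustrate the filtering:

```bash
# Hypothetical contents, for illustration only:
#   backend/requirements.txt  ->  jsonschema
#                                 pynamodb
#   all-requirements.txt      ->  aws-cdk.core==1.73.0
#                                 jsonschema==3.2.0
#                                 pynamodb==4.3.3
# The filter keeps only the backend packages, dropping the infra-only entries:
grep --file=backend/requirements.txt all-requirements.txt > backend-requirements.txt
# backend-requirements.txt now contains jsonschema==3.2.0 and pynamodb==4.3.3
```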
8 changes: 0 additions & 8 deletions backend/endpoints/datasets/bundle.sh

This file was deleted.

File renamed without changes.
33 changes: 15 additions & 18 deletions infra/data_stores/data_lake_stack.py
@@ -50,25 +50,22 @@ def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
         Tags.of(db_datasets_table).add("ApplicationLayer", "application-db")

         # Lambda Handler Functions
-        lambda_path = "../backend/endpoints/datasets"
-        dataset_handler_function = aws_lambda.Function(
-            self,
-            "datasets-endpoint-function",
-            function_name="datasets-endpoint",
-            handler="endpoints.datasets.entrypoint.lambda_handler",
-            runtime=aws_lambda.Runtime.PYTHON_3_6,
-            code=aws_lambda.Code.from_asset(
-                path=os.path.dirname(lambda_path),
-                bundling=core.BundlingOptions(
-                    image=aws_lambda.Runtime.PYTHON_3_6.bundling_docker_image,  # pylint:disable=no-member
-                    command=[
-                        "bash",
-                        "-c",
-                        open(f"{lambda_path}/bundle.sh", "r").read(),
-                    ],
-                ),
-            ),
-        )
+        project_path = ".."
+        with open(os.path.join(project_path, "backend/bundle.bash"), "r") as bundler:
+            dataset_handler_function = aws_lambda.Function(
+                self,
+                "datasets-endpoint-function",
+                function_name="datasets-endpoint",
+                handler="endpoints.datasets.entrypoint.lambda_handler",
+                runtime=aws_lambda.Runtime.PYTHON_3_6,
+                code=aws_lambda.Code.from_asset(
+                    path=project_path,
+                    bundling=core.BundlingOptions(
+                        image=aws_lambda.Runtime.PYTHON_3_6.bundling_docker_image,  # pylint:disable=no-member
+                        command=["bash", "-c", bundler.read()],
+                    ),
+                ),
+            )

         db_datasets_table.add_global_secondary_index(
             index_name="datasets_title",
             partition_key=aws_dynamodb.Attribute(name="sk", type=aws_dynamodb.AttributeType.STRING),
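
A note on how the bundling above executes bundle.bash: CDK's `Code.from_asset` with `BundlingOptions` runs the given command inside a container, mounting the asset `path` at /asset-input (also the default working directory) and the asset staging output directory at /asset-output. Roughly, and only as a sketch, with the image name and staging directory as placeholders managed by CDK:

```bash
# Approximation of the container run performed during `cdk synth`/`cdk deploy`;
# BUNDLING_IMAGE stands in for aws_lambda.Runtime.PYTHON_3_6.bundling_docker_image
# and ASSET_STAGING_DIR for CDK's asset staging directory.
docker run --rm \
    --volume "$(pwd)/..:/asset-input" \
    --volume "$ASSET_STAGING_DIR:/asset-output" \
    --workdir /asset-input \
    "$BUNDLING_IMAGE" \
    bash -c "$(cat backend/bundle.bash)"
```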
6 changes: 0 additions & 6 deletions infra/requirements.txt

This file was deleted.
