Enable dbt python models via serverless Dataproc and create example model #2346

Merged · 4 commits · Jun 1, 2023
3 changes: 3 additions & 0 deletions warehouse/.gitignore
@@ -1,5 +1,8 @@
.user.yml

Miniconda3-py39_4.10.3-Linux-x86_64.sh
spark-bigquery-with-dependencies_2.12-0.22.2.jar

target/
dbt_packages/
logs/
118 changes: 118 additions & 0 deletions warehouse/Dockerfile.spark
@@ -0,0 +1,118 @@
# from https://cloud.google.com/dataproc-serverless/docs/guides/custom-containers#example_custom_container_image_build

# Debian 11 is recommended.
FROM debian:11-slim

LABEL org.opencontainers.image.source https://github.com/cal-itp/data-infra

# Suppress interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# (Required) Install utilities required by Spark scripts.
RUN apt update && apt install -y procps tini libjemalloc2

# Enable jemalloc2 as default memory allocator
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2

# (Optional) Add extra jars.
#ENV SPARK_EXTRA_JARS_DIR=/opt/spark/jars/
#ENV SPARK_EXTRA_CLASSPATH='/opt/spark/jars/*'
#RUN mkdir -p "${SPARK_EXTRA_JARS_DIR}"
#COPY spark-bigquery-with-dependencies_2.12-0.22.2.jar "${SPARK_EXTRA_JARS_DIR}"

# (Optional) Install and configure Miniconda3.
ENV CONDA_HOME=/opt/miniconda3
ENV PYSPARK_PYTHON=${CONDA_HOME}/bin/python
ENV PATH=${CONDA_HOME}/bin:${PATH}
COPY Miniconda3-py39_4.10.3-Linux-x86_64.sh .
RUN bash Miniconda3-py39_4.10.3-Linux-x86_64.sh -b -p /opt/miniconda3 \
&& ${CONDA_HOME}/bin/conda config --system --set always_yes True \
&& ${CONDA_HOME}/bin/conda config --system --set auto_update_conda False \
&& ${CONDA_HOME}/bin/conda config --system --prepend channels conda-forge \
&& ${CONDA_HOME}/bin/conda config --system --set channel_priority strict

# (Optional) Install Conda packages.
#
# The following packages are installed in the default image; it is strongly
# recommended to include all of them.
#
# Use mamba to install packages quickly.
RUN ${CONDA_HOME}/bin/conda install mamba -n base -c conda-forge \
&& ${CONDA_HOME}/bin/mamba install \
conda \
cython \
fastavro \
fastparquet \
gcsfs \
google-cloud-bigquery-storage \
google-cloud-bigquery[pandas] \
google-cloud-bigtable \
google-cloud-container \
google-cloud-datacatalog \
google-cloud-dataproc \
google-cloud-datastore \
google-cloud-language \
google-cloud-logging \
google-cloud-monitoring \
google-cloud-pubsub \
google-cloud-redis \
google-cloud-spanner \
google-cloud-speech \
google-cloud-storage \
google-cloud-texttospeech \
google-cloud-translate \
google-cloud-vision \
koalas \
matplotlib \
nltk \
numba \
numpy \
openblas \
orc \
pandas \
pyarrow \
pysal \
pytables \
python \
regex \
requests \
rtree \
scikit-image \
scikit-learn \
scipy \
seaborn \
sqlalchemy \
sympy \
virtualenv \
shapely==1.8.5.post1

# (Optional) Add extra Python modules.
#ENV PYTHONPATH=/opt/python/packages
#RUN mkdir -p "${PYTHONPATH}"
#COPY test_util.py "${PYTHONPATH}"

# (Optional) Install R and R libraries.
# The key ID sometimes needs to be updated; the error output will reference the current key ID to import
#RUN apt update \
# && apt install -y gnupg \
# && apt-key adv --no-tty \
# --keyserver "hkp://keyserver.ubuntu.com:80" \
# --recv-keys B8F25A8A73EACF41 \
# && echo "deb http://cloud.r-project.org/bin/linux/debian bullseye-cran40/" \
# >/etc/apt/sources.list.d/cran-r.list \
# && apt update \
# && apt install -y \
# libopenblas-base \
# libssl-dev \
# r-base \
# r-base-dev \
# r-recommended \
# r-cran-blob
#
#ENV R_HOME=/usr/lib/R

# (Required) Create the 'spark' group/user.
# The GID and UID must be 1099. Home directory is required.
RUN groupadd -g 1099 spark
RUN useradd -u 1099 -g 1099 -d /home/spark -m spark
USER spark
23 changes: 23 additions & 0 deletions warehouse/README.md
@@ -186,6 +186,29 @@ If you prefer to install dbt locally and use your own development environment, y
3. `brew link python@3.9`
4. After restarting the terminal, confirm with `python3 --version` and retry `poetry install`

### Dataproc configuration

> If you are not using Python models or are just using the existing Dataproc configuration, you can ignore this section.

The [dbt docs](https://docs.getdbt.com/docs/build/python-models) cover setting up Python models in general, as well as the specific steps required to configure BigQuery/Dataproc.
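
For orientation, a dbt Python model is just a `.py` file in the models directory that defines a `model(dbt, session)` function and returns a DataFrame; on BigQuery it is executed as a PySpark job on Dataproc. The sketch below is a minimal, hypothetical example (the file path, upstream model name, and column name are made up, and this is not the example model added in this PR):

```python
# Hypothetical dbt Python model, e.g. warehouse/models/staging/my_python_model.py
import pyspark.sql.functions as F  # PySpark is provided by the Dataproc Serverless runtime


def model(dbt, session):
    # Per-model config; these settings can also live in dbt_project.yml
    dbt.config(materialized="table", submission_method="serverless")

    # On BigQuery/Dataproc, dbt.ref() returns a PySpark DataFrame
    upstream = dbt.ref("some_upstream_model")  # hypothetical upstream model

    # Trivial transformation purely for illustration; the returned DataFrame
    # is written back to BigQuery as this model's table
    return upstream.where(F.col("some_column").isNotNull())  # hypothetical column
```

Such a model runs like any other, e.g. `dbt run --select my_python_model` (hypothetical selector).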

The default profile template specifies `gcr.io/cal-itp-data-infra/dbt-spark:<date_tag>` as the custom image for
Dataproc batch jobs. This image is built and pushed with the commands below; note that the image is hosted on Google
Container Registry (`gcr.io`), not GitHub Container Registry (`ghcr.io`). It will eventually need to be migrated to
Google Artifact Registry, which is replacing GCR.

```bash
docker build -f Dockerfile.spark -t gcr.io/cal-itp-data-infra/dbt-spark:2023.3.28 .
docker push gcr.io/cal-itp-data-infra/dbt-spark:2023.3.28
```

Dockerfile.spark is based on the [example provided by Google in the Dataproc Serverless documentation](https://cloud.google.com/dataproc-serverless/docs/guides/custom-containers#example_custom_container_image_build).
It copies two local files into the image; download links for these files are provided as comments in case the image needs to be rebuilt.

In addition to the steps specified in the dbt docs, [Private Google Access was enabled on our default VPC](https://cloud.google.com/vpc/docs/configure-private-google-access#enabling-pga),
and the cal-itp-data-infra-staging project's default service account (`473674835135-compute@developer.gserviceaccount.com`) was granted access to the production project,
since the buckets for compiled Python models (`gs://calitp-dbt-python-models` and `gs://test-calitp-dbt-python-models`)
as well as the external tables live in the production project.

### Troubleshooting
