Enable dbt python models via serverless Dataproc and create example model #2346

Merged · 4 commits · Jun 1, 2023
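For context, a dbt Python model targeting BigQuery can request serverless Dataproc execution through its model config. The sketch below is illustrative only and is not part of this PR's diff: the model name, upstream ref, and column name are hypothetical, and it assumes dbt-bigquery's submission_method="serverless" option together with the spark-bigquery connector jar referenced in the .gitignore change below.

# Hypothetical dbt Python model, e.g. models/example_python_model.py.
# All identifiers are placeholders, not values from this PR.
def model(dbt, session):
    # Ask dbt-bigquery to run this model as a serverless Dataproc batch
    dbt.config(
        materialized="table",
        submission_method="serverless",
    )

    # dbt.ref() returns a PySpark DataFrame when running on Dataproc;
    # "stg_example" and "some_column" are hypothetical names
    df = dbt.ref("stg_example")
    return df.groupBy("some_column").count()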
3 changes: 3 additions & 0 deletions warehouse/.gitignore
@@ -1,5 +1,8 @@
.user.yml

Miniconda3-py39_4.10.3-Linux-x86_64.sh
spark-bigquery-with-dependencies_2.12-0.22.2.jar

target/
dbt_packages/
logs/
118 changes: 118 additions & 0 deletions warehouse/Dockerfile.spark
@@ -0,0 +1,118 @@
# from https://cloud.google.com/dataproc-serverless/docs/guides/custom-containers#example_custom_container_image_build

# Debian 11 is recommended.
FROM debian:11-slim

LABEL org.opencontainers.image.source https://github.com/cal-itp/data-infra

# Suppress interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# (Required) Install utilities required by Spark scripts.
RUN apt update && apt install -y procps tini libjemalloc2

# Enable jemalloc2 as default memory allocator
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2

# (Optional) Add extra jars.
#ENV SPARK_EXTRA_JARS_DIR=/opt/spark/jars/
#ENV SPARK_EXTRA_CLASSPATH='/opt/spark/jars/*'
#RUN mkdir -p "${SPARK_EXTRA_JARS_DIR}"
#COPY spark-bigquery-with-dependencies_2.12-0.22.2.jar "${SPARK_EXTRA_JARS_DIR}"

# (Optional) Install and configure Miniconda3.
ENV CONDA_HOME=/opt/miniconda3
ENV PYSPARK_PYTHON=${CONDA_HOME}/bin/python
ENV PATH=${CONDA_HOME}/bin:${PATH}
COPY Miniconda3-py39_4.10.3-Linux-x86_64.sh .
RUN bash Miniconda3-py39_4.10.3-Linux-x86_64.sh -b -p /opt/miniconda3 \
&& ${CONDA_HOME}/bin/conda config --system --set always_yes True \
&& ${CONDA_HOME}/bin/conda config --system --set auto_update_conda False \
&& ${CONDA_HOME}/bin/conda config --system --prepend channels conda-forge \
&& ${CONDA_HOME}/bin/conda config --system --set channel_priority strict

# (Optional) Install Conda packages.
#
# The following packages are installed in the default image; it is strongly
# recommended to include all of them.
#
# Use mamba to install packages quickly.
RUN ${CONDA_HOME}/bin/conda install mamba -n base -c conda-forge \
&& ${CONDA_HOME}/bin/mamba install \
conda \
cython \
fastavro \
fastparquet \
gcsfs \
google-cloud-bigquery-storage \
google-cloud-bigquery[pandas] \
google-cloud-bigtable \
google-cloud-container \
google-cloud-datacatalog \
google-cloud-dataproc \
google-cloud-datastore \
google-cloud-language \
google-cloud-logging \
google-cloud-monitoring \
google-cloud-pubsub \
google-cloud-redis \
google-cloud-spanner \
google-cloud-speech \
google-cloud-storage \
google-cloud-texttospeech \
google-cloud-translate \
google-cloud-vision \
koalas \
matplotlib \
nltk \
numba \
numpy \
openblas \
orc \
pandas \
pyarrow \
pysal \
pytables \
python \
regex \
requests \
rtree \
scikit-image \
scikit-learn \
scipy \
seaborn \
sqlalchemy \
sympy \
virtualenv \
shapely==1.8.5.post1

# (Optional) Add extra Python modules.
#ENV PYTHONPATH=/opt/python/packages
#RUN mkdir -p "${PYTHONPATH}"
#COPY test_util.py "${PYTHONPATH}"

# (Optional) Install R and R libraries.
# The key ID sometimes needs to be updated; the error output will reference the current key ID to import
#RUN apt update \
# && apt install -y gnupg \
# && apt-key adv --no-tty \
# --keyserver "hkp://keyserver.ubuntu.com:80" \
# --recv-keys B8F25A8A73EACF41 \
# && echo "deb http://cloud.r-project.org/bin/linux/debian bullseye-cran40/" \
# >/etc/apt/sources.list.d/cran-r.list \
# && apt update \
# && apt install -y \
# libopenblas-base \
# libssl-dev \
# r-base \
# r-base-dev \
# r-recommended \
# r-cran-blob
#
#ENV R_HOME=/usr/lib/R

# (Required) Create the 'spark' group/user.
# The GID and UID must be 1099. Home directory is required.
RUN groupadd -g 1099 spark
RUN useradd -u 1099 -g 1099 -d /home/spark -m spark
USER spark
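
dbt-bigquery submits the serverless batch itself, but purely as an illustration of where an image built from Dockerfile.spark plugs in, here is a minimal sketch using the google-cloud-dataproc client. The project, region, bucket path, and image URI are placeholders, not values from this PR.

# Minimal sketch: submitting a PySpark batch to serverless Dataproc with a
# custom container image; dbt-bigquery performs an equivalent submission internally.
from google.cloud import dataproc_v1

project, region = "my-project", "us-west2"  # placeholders
client = dataproc_v1.BatchControllerClient(
    client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)

batch = dataproc_v1.Batch(
    pyspark_batch=dataproc_v1.PySparkBatch(
        main_python_file_uri="gs://my-bucket/jobs/example_model.py",  # placeholder
    ),
    runtime_config=dataproc_v1.RuntimeConfig(
        # image built from Dockerfile.spark and pushed to a registry (placeholder URI)
        container_image="gcr.io/my-project/dbt-spark:latest",
    ),
)

operation = client.create_batch(
    parent=f"projects/{project}/locations/{region}",
    batch=batch,
)
operation.result()  # blocks until the batch finishes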