Updating README and local config #4211

Open · wants to merge 12 commits into base: qat
2 changes: 1 addition & 1 deletion .env.template
@@ -34,7 +34,7 @@ USASPENDING_DB_USER=usaspending
USASPENDING_DB_PASSWORD=usaspender

# All values of BROKER_DB_* must match what is in DATA_BROKER_DATABASE_URL if BOTH are given
DATA_BROKER_DATABASE_URL=postgres://admin:root@dataact-broker-db:5432/data_store_api
DATA_BROKER_DATABASE_URL=postgres://admin:root@dataact-broker-db:5432/data_broker
# Configuration values for a connection string to a Broker database
# Only necessary for some management commands
BROKER_DB_HOST=dataact-broker-db
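For illustration, a hypothetical .env fragment in which both settings point at the same Broker database, per the NOTE above (only BROKER_DB_HOST appears in the visible diff; the remaining BROKER_DB_* names and values are assumed here):

    DATA_BROKER_DATABASE_URL=postgres://admin:root@dataact-broker-db:5432/data_broker
    BROKER_DB_HOST=dataact-broker-db
    BROKER_DB_PORT=5432
    BROKER_DB_USER=admin
    BROKER_DB_PASSWORD=root
    BROKER_DB_NAME=data_broker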
2 changes: 1 addition & 1 deletion .gitignore
@@ -18,7 +18,7 @@ tmp/*
spark-warehouse/
derby.log

data/output/
data/*

# pyenv ignores
\.python-version
2 changes: 1 addition & 1 deletion Dockerfile
@@ -12,7 +12,7 @@ FROM python:3.8.16-slim-bullseye
WORKDIR /dockermount

RUN apt update && \
apt install -y gcc postgresql-13
apt install -y curl gcc postgresql-13

##### Copy python packaged
WORKDIR /dockermount
37 changes: 12 additions & 25 deletions Dockerfile.spark
@@ -1,37 +1,24 @@
FROM centos:7
FROM python:3.8.16-slim-bullseye

# Build ARGs
# Forcing back to python 3.8 to be in sync with local dev env.
# Can't run driver and worker on different Python versions when driver is local dev machine and not spark-submit container
#ARG PYTHON_VERSION=3.8.10
ARG PYTHON_VERSION=3.8.16
ARG HADOOP_VERSION=3.3.1
ARG SPARK_VERSION=3.2.1
ARG PROJECT_LOG_DIR=/logs

RUN yum -y update && yum clean all
# sqlite-devel added as prerequisite for coverage python lib, used by pytest-cov plugin
RUN yum -y install wget gcc openssl-devel bzip2-devel libffi libffi-devel zlib-devel sqlite-devel
RUN yum -y groupinstall "Development Tools"
# Install dependencies
RUN apt update && \
apt install -y coreutils gcc wget openssl libssl-dev libbz2-dev build-essential

# Building Python 3.x
WORKDIR /usr/src
RUN wget --quiet https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
&& tar xzf Python-${PYTHON_VERSION}.tgz
WORKDIR /usr/src/Python-${PYTHON_VERSION}
RUN ./configure --enable-optimizations \
&& make altinstall \
&& ln -sf /usr/local/bin/python`echo ${PYTHON_VERSION} | awk -F. '{short_version=$1 FS $2; print short_version}'` /usr/bin/python3 \
&& echo "Installed $(python3 --version)"
# Ensure Python STDOUT gets sent to container logs
ENV PYTHONUNBUFFERED=1

# Install Java 1.8.x
RUN yum -y install java-1.8.0-openjdk
ENV JAVA_HOME=/usr/lib/jvm/jre
# Install Amazon Corretto 8 (a Long-Term Supported (LTS) distribution of OpenJDK 8)
RUN wget -qO - https://apt.corretto.aws/corretto.key | gpg --dearmor -o /usr/share/keyrings/corretto-keyring.gpg && \
echo "deb [signed-by=/usr/share/keyrings/corretto-keyring.gpg] https://apt.corretto.aws stable main" | tee /etc/apt/sources.list.d/corretto.list
RUN apt update && \
apt install -y java-1.8.0-amazon-corretto-jdk
ENV JAVA_HOME=/usr/lib/jvm/java-1.8.0-amazon-corretto

# Install Hadoop and Spark
WORKDIR /usr/local

RUN wget --quiet https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz \
&& tar xzf hadoop-${HADOOP_VERSION}.tar.gz \
&& ln -sfn /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop \
@@ -43,7 +30,7 @@ ENV HADOOP_HOME=/usr/local/hadoop
ENV SPARK_HOME=/usr/local/spark
# Cannot set ENV var = command-result, [i.e. doing: ENV SPARK_DIST_CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath)], so interpolating the hadoop classpath the long way
ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop/*:$HADOOP_HOME/share/hadoop/common/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/hdfs/lib/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/yarn/lib/*:$HADOOP_HOME/share/hadoop/yarn/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*:$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/tools/lib/*"
ENV PATH=${SPARK_HOME}/bin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${PATH};
ENV PATH=${SPARK_HOME}/bin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${PATH}
RUN echo "Installed Spark" && echo "$(${SPARK_HOME}/bin/pyspark --version)"

# Config for starting up the Spark History Server
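A quick sanity check of the rebuilt image might look like the following sketch. It relies on the spark-base tag used by the Makefile's docker-build-spark target and assumes the image keeps the default entrypoint, so the commands run directly:

    make docker-build-spark
    docker run --rm spark-base java -version       # expect an Amazon Corretto 1.8.0 build
    docker run --rm spark-base pyspark --version   # expect Spark 3.2.1 on Python 3.8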
26 changes: 13 additions & 13 deletions Dockerfile.testing
@@ -8,19 +8,19 @@

FROM usaspending-backend:latest

WORKDIR /usaspending-api

# Install Java for PySpark
RUN yum install -y java-1.8.0-openjdk-devel
RUN export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64
RUN export PATH=$JAVA_HOME/bin:$PATHv
# Install dependencies
RUN apt update && \
apt install -y build-essential coreutils wget

# Install Amazon Corretto 8 (a Long-Term Supported (LTS) distribution of OpenJDK 8)
RUN wget -qO - https://apt.corretto.aws/corretto.key | gpg --dearmor -o /usr/share/keyrings/corretto-keyring.gpg && \
echo "deb [signed-by=/usr/share/keyrings/corretto-keyring.gpg] https://apt.corretto.aws stable main" | tee /etc/apt/sources.list.d/corretto.list
RUN apt update && \
apt install -y java-1.8.0-amazon-corretto-jdk
RUN export JAVA_HOME=/usr/lib/jvm/java-1.8.0-amazon-corretto
RUN export PATH=${JAVA_HOME}/bin:$PATH

# Prevent a Pytest RuntimeError where it expects the parent directory to be the name of an installed Django app
# This will cause a RuntimeError because `dockermount` isn't the name of an installed Django app, so we move the files
# to a `/usaspending-api` parent directory which is the name of an installed Django app.
RUN mv /dockermount/* /usaspending-api/

# Copy .env.template file for config tests
COPY .env.template /usaspending-api/
WORKDIR /usaspending-api
COPY requirements/requirements-dev.txt requirements/requirements-dev.txt

RUN python3 -m pip install -r requirements/requirements-dev.txt
64 changes: 32 additions & 32 deletions Makefile
@@ -161,51 +161,51 @@ endif


.PHONY: docker-compose
docker-compose: ## Run an arbitrary docker-compose command by passing in the Docker Compose profiles in the "profiles" variable, and args in the "args" variable
docker-compose: ## Run an arbitrary docker compose command by passing in the Docker Compose profiles in the "profiles" variable, and args in the "args" variable
# NOTE: The .env file is used to provide environment variable values that replace variables in the compose file
# Because the .env file does not live in the same place as the compose file, we have to tell compose explicitly
# where it is with "--project_directory". Since this is called from the root Makefile, using ./ points to the dir
# of that Makefile
docker-compose ${profiles} --project-directory . --file ${docker_compose_file} ${args}
docker compose ${profiles} --project-directory . --file ${docker_compose_file} ${args}
Contributor:
Why did we remove the hyphen in docker compose? I have historically always used docker-compose. Is there a difference? For example: docker-compose up usaspending-db usaspending-es

Contributor:
@ayubshahab Recent updates to Docker have added compose as a subcommand of the docker CLI itself, rather than shipping it as a standalone docker-compose executable.
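A quick illustration of the two invocations being discussed, using the service names from the comment above (this assumes Docker Compose V2 is installed as a plugin of the docker CLI):

    # Compose V1: standalone binary, hyphenated command
    docker-compose up usaspending-db usaspending-es

    # Compose V2: "compose" is a subcommand of docker itself
    docker compose up usaspending-db usaspending-es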



.PHONY: docker-compose-config
docker-compose-config: ## Show config and vars expanded, which will be used in docker-compose
docker-compose-config: ## Show config and vars expanded, which will be used in docker compose
# NOTE: The .env file is used to provide environment variable values that replace variables in the compose file
# Because the .env file does not live in the same place as the compose file, we have to tell compose explicitly
# where it is with "--project_directory". Since this is called from the root Makefile, using ./ points to the dir
# of that Makefile
docker-compose --project-directory . --file ${docker_compose_file} config ${args}
docker compose --project-directory . --file ${docker_compose_file} config ${args}

.PHONY: docker-compose-up-usaspending
docker-compose-up-usaspending: ## Deploy containerized version of this app on the local machine using docker-compose
# To 'up' a single docker-compose service, pass it in the args var, e.g.: make deploy-docker args=my-service
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} up ${args}
docker-compose-up-usaspending: ## Deploy containerized version of this app on the local machine using docker compose
# To 'up' a single docker compose service, pass it in the args var, e.g.: make deploy-docker args=my-service
# NOTE: [See NOTE in docker compose rule about .env file]
docker compose --profile usaspending --project-directory . --file ${docker_compose_file} up ${args}

.PHONY: docker-compose-up-s3
docker-compose-up-s3: ## Deploy minio container on the local machine using docker-compose, which acts as a look-alike AWS S3 service
# NOTE: [See NOTE in docker-compose rule about .env file]
echo "docker-compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}"
docker-compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}
docker-compose-up-s3: ## Deploy minio container on the local machine using docker compose, which acts as a look-alike AWS S3 service
# NOTE: [See NOTE in docker compose rule about .env file]
echo "docker compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}"
docker compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}

.PHONY: docker-compose-up-spark
docker-compose-up-spark: ## Deploy containerized version of spark cluster infrastructure on the local machine using docker-compose
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --profile spark --project-directory . --file ${docker_compose_file} up ${args}
docker-compose-up-spark: ## Deploy containerized version of spark cluster infrastructure on the local machine using docker compose
# NOTE: [See NOTE in docker compose rule about .env file]
docker compose --profile spark --project-directory . --file ${docker_compose_file} up ${args}

.PHONY: docker-compose-run
docker-compose-run: ## Use docker-compose run <args> to run one or more Docker Compose services with options
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose ${profiles} --project-directory . --file ${docker_compose_file} run ${args}
docker-compose-run: ## Use docker compose run <args> to run one or more Docker Compose services with options
# NOTE: [See NOTE in docker compose rule about .env file]
docker compose ${profiles} --project-directory . --file ${docker_compose_file} run ${args}

.PHONY: docker-compose-down
docker-compose-down: ## Run docker-compose down to bring down services listed in the compose file
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --project-directory . --file ${docker_compose_file} down ${args}
docker-compose-down: ## Run docker compose down to bring down services listed in the compose file
# NOTE: [See NOTE in docker compose rule about .env file]
docker compose --project-directory . --file ${docker_compose_file} down ${args}

.PHONY: docker-build-spark
docker-build-spark: ## Run docker build to build a base container image for spark, hadoop, and python installed
# NOTE: [See NOTE in above docker-compose rule about .env file]
# NOTE: [See NOTE in above docker compose rule about .env file]
echo "docker build --tag spark-base --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args} --file ${dockerfile_for_spark} $$(dirname ${dockerfile_for_spark})"
docker build --tag spark-base --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args} --file ${dockerfile_for_spark} $$(dirname ${dockerfile_for_spark})

@@ -214,24 +214,24 @@ docker-compose-build: ## Ensure ALL services in the docker-compose.yaml file hav
# NOTE: This *may* creates a compose-specific image name IF an image: YAML key does not specify the image name to be used as
# a tag when compose has to build the image.
# If no image key is specified, then be aware that:
# While building and tagging the spark-base image can be done, docker-compose will _NOT USE_ that image at runtime,
# While building and tagging the spark-base image can be done, docker compose will _NOT USE_ that image at runtime,
# but look for an image with its custom tag. It may use cached layers of that image when doing its build,
# but it will create a _differently named_ image: the image name is always going to be <project>_<service>,
# where project defaults to the directory name you're in. Therefore you MUST always run this command (or the manual version of it)
# anytime you want services run with Docker Compose to accommodate recent changes in the image (e.g. python package dependency changes)
# NOTE: [See NOTE in above docker-compose rule about .env file]
echo "docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}
# NOTE: [See NOTE in above docker compose rule about .env file]
echo "docker compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}
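If a service should be built and run under one explicit tag instead of the <project>_<service> name described in the NOTE above, the compose file can pair build: with an image: key. An illustrative fragment (the service name here is hypothetical, not taken from the project's docker-compose.yaml):

    services:
      usaspending-api:
        build: .
        image: usaspending-backend:latest   # compose builds and runs this exact tag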

.PHONY: docker-compose-build-spark
docker-compose-build-spark: ## See: docker-compose-build rule. This builds just the subset of spark services.
# NOTE: [See NOTE in above docker-compose rule about .env file]=
echo "docker-compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker-compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}
# NOTE: [See NOTE in above docker compose rule about .env file]=
echo "docker compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}

.PHONY: docker-compose-spark-submit
docker-compose-spark-submit: ## Run spark-submit from within local docker containerized infrastructure (which must be running first). Set params with django_command="..."
docker-compose --profile=spark --project-directory . --file ${docker_compose_file} run \
docker compose --profile=spark --project-directory . --file ${docker_compose_file} run \
-e MINIO_HOST=minio \
-e COMPONENT_NAME='${django_command}${python_script}' \
-e DATABASE_URL=${DATABASE_URL} \
@@ -266,5 +266,5 @@ pyspark-shell: ## Launch a local pyspark REPL shell with all of the packages and
--conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \
--conf spark.hadoop.fs.s3a.path.style.access=true \
--conf spark.sql.catalogImplementation=hive \
--conf spark.sql.warehouse.dir='$(PWD)/spark-warehouse' \
--conf spark.hadoop.javax.jdo.option.ConnectionURL='jdbc:derby:;databaseName=$(PWD)/spark-warehouse/metastore_db;create=true'
--conf spark.sql.warehouse.dir='./spark-warehouse' \
--conf spark.hadoop.javax.jdo.option.ConnectionURL='jdbc:derby:;databaseName=./spark-warehouse/metastore_db;create=true'