Updating README and local config #4211

Open · wants to merge 12 commits into base: qat
2 changes: 1 addition & 1 deletion .env.template
@@ -34,7 +34,7 @@ USASPENDING_DB_USER=usaspending
USASPENDING_DB_PASSWORD=usaspender

# All values of BROKER_DB_* must match what is in DATA_BROKER_DATABASE_URL if BOTH are given
DATA_BROKER_DATABASE_URL=postgres://admin:root@dataact-broker-db:5432/data_store_api
DATA_BROKER_DATABASE_URL=postgres://admin:root@dataact-broker-db:5432/data_broker
# Configuration values for a connection string to a Broker database
# Only necessary for some management commands
BROKER_DB_HOST=dataact-broker-db
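For illustration, a hypothetical .env fragment in which both settings point at the same Broker database, per the NOTE above (only BROKER_DB_HOST appears in the visible diff; the remaining BROKER_DB_* names and values are assumed here):

    DATA_BROKER_DATABASE_URL=postgres://admin:root@dataact-broker-db:5432/data_broker
    BROKER_DB_HOST=dataact-broker-db
    BROKER_DB_PORT=5432
    BROKER_DB_USER=admin
    BROKER_DB_PASSWORD=root
    BROKER_DB_NAME=data_broker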
2 changes: 1 addition & 1 deletion .gitignore
@@ -18,7 +18,7 @@ tmp/*
spark-warehouse/
derby.log

data/output/
data/*

# pyenv ignores
\.python-version
2 changes: 1 addition & 1 deletion Dockerfile
@@ -12,7 +12,7 @@ FROM python:3.8.16-slim-bullseye
WORKDIR /dockermount

RUN apt update && \
apt install -y gcc postgresql-13
apt install -y curl gcc postgresql-13

##### Copy python packaged
WORKDIR /dockermount
37 changes: 12 additions & 25 deletions Dockerfile.spark
@@ -1,37 +1,24 @@
FROM centos:7
FROM python:3.8.16-slim-bullseye

# Build ARGs
# Forcing back to python 3.8 to be in sync with local dev env.
# Can't run driver and worker on different Python versions when driver is local dev machine and not spark-submit container
#ARG PYTHON_VERSION=3.8.10
ARG PYTHON_VERSION=3.8.16
ARG HADOOP_VERSION=3.3.1
ARG SPARK_VERSION=3.2.1
ARG PROJECT_LOG_DIR=/logs

RUN yum -y update && yum clean all
# sqlite-devel added as prerequisite for coverage python lib, used by pytest-cov plugin
RUN yum -y install wget gcc openssl-devel bzip2-devel libffi libffi-devel zlib-devel sqlite-devel
RUN yum -y groupinstall "Development Tools"
# Install dependencies
RUN apt update && \
apt install -y coreutils gcc wget openssl libssl-dev libbz2-dev build-essential

# Building Python 3.x
WORKDIR /usr/src
RUN wget --quiet https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
&& tar xzf Python-${PYTHON_VERSION}.tgz
WORKDIR /usr/src/Python-${PYTHON_VERSION}
RUN ./configure --enable-optimizations \
&& make altinstall \
&& ln -sf /usr/local/bin/python`echo ${PYTHON_VERSION} | awk -F. '{short_version=$1 FS $2; print short_version}'` /usr/bin/python3 \
&& echo "Installed $(python3 --version)"
# Ensure Python STDOUT gets sent to container logs
ENV PYTHONUNBUFFERED=1

# Install Java 1.8.x
RUN yum -y install java-1.8.0-openjdk
ENV JAVA_HOME=/usr/lib/jvm/jre
# Install Amazon Corretto 8 (a Long-Term Supported (LTS) distribution of OpenJDK 8)
RUN wget -qO - https://apt.corretto.aws/corretto.key | gpg --dearmor -o /usr/share/keyrings/corretto-keyring.gpg && \
echo "deb [signed-by=/usr/share/keyrings/corretto-keyring.gpg] https://apt.corretto.aws stable main" | tee /etc/apt/sources.list.d/corretto.list
RUN apt update && \
apt install -y java-1.8.0-amazon-corretto-jdk
ENV JAVA_HOME=/usr/lib/jvm/java-1.8.0-amazon-corretto

# Install Hadoop and Spark
WORKDIR /usr/local

RUN wget --quiet https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz \
&& tar xzf hadoop-${HADOOP_VERSION}.tar.gz \
&& ln -sfn /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop \
@@ -43,7 +30,7 @@ ENV HADOOP_HOME=/usr/local/hadoop
ENV SPARK_HOME=/usr/local/spark
# Cannot set ENV var = command-result, [i.e. doing: ENV SPARK_DIST_CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath)], so interpolating the hadoop classpath the long way
ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop/*:$HADOOP_HOME/share/hadoop/common/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/hdfs/lib/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/yarn/lib/*:$HADOOP_HOME/share/hadoop/yarn/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*:$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/tools/lib/*"
ENV PATH=${SPARK_HOME}/bin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${PATH};
ENV PATH=${SPARK_HOME}/bin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${PATH}
RUN echo "Installed Spark" && echo "$(${SPARK_HOME}/bin/pyspark --version)"

# Config for starting up the Spark History Server
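A quick sanity check of the rebuilt image might look like the following sketch. It relies on the spark-base tag used by the Makefile's docker-build-spark target and assumes the image keeps the default entrypoint, so the commands run directly:

    make docker-build-spark
    docker run --rm spark-base java -version       # expect an Amazon Corretto 1.8.0 build
    docker run --rm spark-base pyspark --version   # expect Spark 3.2.1 on Python 3.8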
26 changes: 13 additions & 13 deletions Dockerfile.testing
@@ -8,19 +8,19 @@

FROM usaspending-backend:latest

WORKDIR /usaspending-api

# Install Java for PySpark
RUN yum install -y java-1.8.0-openjdk-devel
RUN export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64
RUN export PATH=$JAVA_HOME/bin:$PATHv
# Install dependencies
RUN apt update && \
apt install -y build-essential coreutils wget

# Install Amazon Corretto 8 (a Long-Term Supported (LTS) distribution of OpenJDK 8)
RUN wget -qO - https://apt.corretto.aws/corretto.key | gpg --dearmor -o /usr/share/keyrings/corretto-keyring.gpg && \
echo "deb [signed-by=/usr/share/keyrings/corretto-keyring.gpg] https://apt.corretto.aws stable main" | tee /etc/apt/sources.list.d/corretto.list
RUN apt update && \
apt install -y java-1.8.0-amazon-corretto-jdk
RUN export JAVA_HOME=/usr/lib/jvm/java-1.8.0-amazon-corretto
RUN export PATH=${JAVA_HOME}/bin:$PATH

# Prevent a Pytest RuntimeError where it expects the parent directory to be the name of an installed Django app
# This will cause a RuntimeError because `dockermount` isn't the name of an installed Django app, so we move the files
# to a `/usaspending-api` parent directory which is the name of an installed Django app.
RUN mv /dockermount/* /usaspending-api/

# Copy .env.template file for config tests
COPY .env.template /usaspending-api/
WORKDIR /usaspending-api
COPY requirements/requirements-dev.txt requirements/requirements-dev.txt

RUN python3 -m pip install -r requirements/requirements-dev.txt
64 changes: 32 additions & 32 deletions Makefile
@@ -161,51 +161,51 @@ endif


.PHONY: docker-compose
docker-compose: ## Run an arbitrary docker-compose command by passing in the Docker Compose profiles in the "profiles" variable, and args in the "args" variable
docker-compose: ## Run an arbitrary docker compose command by passing in the Docker Compose profiles in the "profiles" variable, and args in the "args" variable
# NOTE: The .env file is used to provide environment variable values that replace variables in the compose file
# Because the .env file does not live in the same place as the compose file, we have to tell compose explicitly
# where it is with "--project_directory". Since this is called from the root Makefile, using ./ points to the dir
# of that Makefile
docker-compose ${profiles} --project-directory . --file ${docker_compose_file} ${args}
docker compose ${profiles} --project-directory . --file ${docker_compose_file} ${args}
Contributor:
Why did we remove the hyphen in docker compose? I have historically always used docker-compose. Is there a difference? For example: docker-compose up usaspending-db usaspending-es

Contributor:
@ayubshahab Recent updates to Docker have added compose as a subcommand of the docker CLI itself, rather than shipping it as a standalone docker-compose executable.
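A quick illustration of the two invocations being discussed, using the service names from the comment above (this assumes Docker Compose V2 is installed as a plugin of the docker CLI):

    # Compose V1: standalone binary, hyphenated command
    docker-compose up usaspending-db usaspending-es

    # Compose V2: "compose" is a subcommand of docker itself
    docker compose up usaspending-db usaspending-es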



.PHONY: docker-compose-config
docker-compose-config: ## Show config and vars expanded, which will be used in docker-compose
docker-compose-config: ## Show config and vars expanded, which will be used in docker compose
# NOTE: The .env file is used to provide environment variable values that replace variables in the compose file
# Because the .env file does not live in the same place as the compose file, we have to tell compose explicitly
# where it is with "--project_directory". Since this is called from the root Makefile, using ./ points to the dir
# of that Makefile
docker-compose --project-directory . --file ${docker_compose_file} config ${args}
docker compose --project-directory . --file ${docker_compose_file} config ${args}

.PHONY: docker-compose-up-usaspending
docker-compose-up-usaspending: ## Deploy containerized version of this app on the local machine using docker-compose
# To 'up' a single docker-compose service, pass it in the args var, e.g.: make deploy-docker args=my-service
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} up ${args}
docker-compose-up-usaspending: ## Deploy containerized version of this app on the local machine using docker compose
# To 'up' a single docker compose service, pass it in the args var, e.g.: make deploy-docker args=my-service
# NOTE: [See NOTE in docker compose rule about .env file]
docker compose --profile usaspending --project-directory . --file ${docker_compose_file} up ${args}

.PHONY: docker-compose-up-s3
docker-compose-up-s3: ## Deploy minio container on the local machine using docker-compose, which acts as a look-alike AWS S3 service
# NOTE: [See NOTE in docker-compose rule about .env file]
echo "docker-compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}"
docker-compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}
docker-compose-up-s3: ## Deploy minio container on the local machine using docker compose, which acts as a look-alike AWS S3 service
# NOTE: [See NOTE in docker compose rule about .env file]
echo "docker compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}"
docker compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}

.PHONY: docker-compose-up-spark
docker-compose-up-spark: ## Deploy containerized version of spark cluster infrastructure on the local machine using docker-compose
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --profile spark --project-directory . --file ${docker_compose_file} up ${args}
docker-compose-up-spark: ## Deploy containerized version of spark cluster infrastructure on the local machine using docker compose
# NOTE: [See NOTE in docker compose rule about .env file]
docker compose --profile spark --project-directory . --file ${docker_compose_file} up ${args}

.PHONY: docker-compose-run
docker-compose-run: ## Use docker-compose run <args> to run one or more Docker Compose services with options
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose ${profiles} --project-directory . --file ${docker_compose_file} run ${args}
docker-compose-run: ## Use docker compose run <args> to run one or more Docker Compose services with options
# NOTE: [See NOTE in docker compose rule about .env file]
docker compose ${profiles} --project-directory . --file ${docker_compose_file} run ${args}

.PHONY: docker-compose-down
docker-compose-down: ## Run docker-compose down to bring down services listed in the compose file
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --project-directory . --file ${docker_compose_file} down ${args}
docker-compose-down: ## Run docker compose down to bring down services listed in the compose file
# NOTE: [See NOTE in docker compose rule about .env file]
docker compose --project-directory . --file ${docker_compose_file} down ${args}

.PHONY: docker-build-spark
docker-build-spark: ## Run docker build to build a base container image for spark, hadoop, and python installed
# NOTE: [See NOTE in above docker-compose rule about .env file]
# NOTE: [See NOTE in above docker compose rule about .env file]
echo "docker build --tag spark-base --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args} --file ${dockerfile_for_spark} $$(dirname ${dockerfile_for_spark})"
docker build --tag spark-base --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args} --file ${dockerfile_for_spark} $$(dirname ${dockerfile_for_spark})

@@ -214,24 +214,24 @@ docker-compose-build: ## Ensure ALL services in the docker-compose.yaml file hav
# NOTE: This *may* creates a compose-specific image name IF an image: YAML key does not specify the image name to be used as
# a tag when compose has to build the image.
# If no image key is specified, then be aware that:
# While building and tagging the spark-base image can be done, docker-compose will _NOT USE_ that image at runtime,
# While building and tagging the spark-base image can be done, docker compose will _NOT USE_ that image at runtime,
# but look for an image with its custom tag. It may use cached layers of that image when doing its build,
# but it will create a _differently named_ image: the image name is always going to be <project>_<service>,
# where project defaults to the directory name you're in. Therefore you MUST always run this command (or the manual version of it)
# anytime you want services run with Docker Compose to accommodate recent changes in the image (e.g. python package dependency changes)
# NOTE: [See NOTE in above docker-compose rule about .env file]
echo "docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}
# NOTE: [See NOTE in above docker compose rule about .env file]
echo "docker compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}
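If a service should be built and run under one explicit tag instead of the <project>_<service> name described in the NOTE above, the compose file can pair build: with an image: key. An illustrative fragment (the service name here is hypothetical, not taken from the project's docker-compose.yaml):

    services:
      usaspending-api:
        build: .
        image: usaspending-backend:latest   # compose builds and runs this exact tag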

.PHONY: docker-compose-build-spark
docker-compose-build-spark: ## See: docker-compose-build rule. This builds just the subset of spark services.
# NOTE: [See NOTE in above docker-compose rule about .env file]=
echo "docker-compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker-compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}
# NOTE: [See NOTE in above docker compose rule about .env file]=
echo "docker compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}

.PHONY: docker-compose-spark-submit
docker-compose-spark-submit: ## Run spark-submit from within local docker containerized infrastructure (which must be running first). Set params with django_command="..."
docker-compose --profile=spark --project-directory . --file ${docker_compose_file} run \
docker compose --profile=spark --project-directory . --file ${docker_compose_file} run \
-e MINIO_HOST=minio \
-e COMPONENT_NAME='${django_command}${python_script}' \
-e DATABASE_URL=${DATABASE_URL} \
@@ -266,5 +266,5 @@ pyspark-shell: ## Launch a local pyspark REPL shell with all of the packages and
--conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \
--conf spark.hadoop.fs.s3a.path.style.access=true \
--conf spark.sql.catalogImplementation=hive \
--conf spark.sql.warehouse.dir='$(PWD)/spark-warehouse' \
--conf spark.hadoop.javax.jdo.option.ConnectionURL='jdbc:derby:;databaseName=$(PWD)/spark-warehouse/metastore_db;create=true'
--conf spark.sql.warehouse.dir='./spark-warehouse' \
--conf spark.hadoop.javax.jdo.option.ConnectionURL='jdbc:derby:;databaseName=./spark-warehouse/metastore_db;create=true'