Skip to content

Commit

Permalink
Do not bloat spark image with ENV variables (#2081)
Browse files (browse the repository at this point in the history)
* Do not bloat spark image with ENV variables

* Remove HadoopVersionTagger
  • Loading branch information
mathbunnyru authored Jan 17, 2024
1 parent d57bf95 commit bf33945
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 27 deletions.
12 changes: 5 additions & 7 deletions images/pyspark-notebook/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,18 @@ ARG scala_version
# But it seems to be slower, which is why we use the recommended site for downloads
ARG spark_download_url="https://dlcdn.apache.org/spark/"

# Configure Spark
ENV SPARK_VERSION="${spark_version}" \
HADOOP_VERSION="${hadoop_version}" \
SCALA_VERSION="${scala_version}" \
SPARK_DOWNLOAD_URL="${spark_download_url}"

ENV SPARK_HOME=/usr/local/spark
ENV PATH="${PATH}:${SPARK_HOME}/bin"
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"

COPY setup_spark.py /opt/setup-scripts/

# Setup Spark
RUN /opt/setup-scripts/setup_spark.py
RUN SPARK_VERSION="${spark_version}" \
HADOOP_VERSION="${hadoop_version}" \
SCALA_VERSION="${scala_version}" \
SPARK_DOWNLOAD_URL="${spark_download_url}" \
/opt/setup-scripts/setup_spark.py

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
Expand Down
3 changes: 1 addition & 2 deletions tagging/images_hierarchy.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
)
from tagging.taggers import (
DateTagger,
HadoopVersionTagger,
JavaVersionTagger,
JuliaVersionTagger,
JupyterHubVersionTagger,
Expand Down Expand Up @@ -83,7 +82,7 @@ class ImageDescription:
),
"pyspark-notebook": ImageDescription(
parent_image="scipy-notebook",
taggers=[SparkVersionTagger(), HadoopVersionTagger(), JavaVersionTagger()],
taggers=[SparkVersionTagger(), JavaVersionTagger()],
manifests=[SparkInfoManifest()],
),
"all-spark-notebook": ImageDescription(
Expand Down
18 changes: 0 additions & 18 deletions tagging/taggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,6 @@ def _get_program_version(container: Container, program: str) -> str:
return DockerRunner.run_simple_command(container, cmd=f"{program} --version")


def _get_env_variable(container: Container, variable: str) -> str:
    """Return the value of the environment variable `variable` in `container`.

    Runs `env` inside the container and looks for an entry whose name is
    exactly `variable`.

    Raises:
        KeyError: if no `variable=...` entry is present in the `env` output.
    """
    env_output = DockerRunner.run_simple_command(
        container,
        cmd="env",
        print_result=False,
    )
    # Match on "NAME=" rather than just "NAME", so that asking for
    # HADOOP_VERSION cannot pick up an entry such as HADOOP_VERSION_FOO=...
    prefix = f"{variable}="
    # Split on line boundaries rather than on arbitrary whitespace, so that
    # values containing spaces are neither truncated nor mistaken for
    # separate entries.
    for env_entry in env_output.splitlines():
        if env_entry.startswith(prefix):
            return env_entry[len(prefix) :]
    raise KeyError(variable)


def _get_pip_package_version(container: Container, package: str) -> str:
PIP_VERSION_PREFIX = "Version: "

Expand Down Expand Up @@ -136,12 +124,6 @@ def tag_value(container: Container) -> str:
return "spark-" + version_line.split(" ")[-1]


class HadoopVersionTagger(TaggerInterface):
    """Tagger that builds a `hadoop-<version>` tag from the container's environment."""

    @staticmethod
    def tag_value(container: Container) -> str:
        """Return `"hadoop-"` followed by the container's `HADOOP_VERSION` value."""
        hadoop_version = _get_env_variable(container, "HADOOP_VERSION")
        return "hadoop-" + hadoop_version


class JavaVersionTagger(TaggerInterface):
@staticmethod
def tag_value(container: Container) -> str:
Expand Down

0 comments on commit bf33945

Please sign in to comment.