Skip to content

Commit

Permalink
chore: downgrade to spark3.3
Browse files Browse the repository at this point in the history
  • Loading branch information
BrendanWalsh authored and mhamilton723 committed Aug 30, 2024
1 parent 392f601 commit d3bbd47
Show file tree
Hide file tree
Showing 21 changed files with 62 additions and 75 deletions.
15 changes: 5 additions & 10 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ import scala.xml.transform.{RewriteRule, RuleTransformer}
import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _}

val condaEnvName = "synapseml"
val sparkVersion = "3.4.1"
val sparkVersion = "3.3.3"
name := "synapseml"
ThisBuild / organization := "com.microsoft.azure"
ThisBuild / scalaVersion := "2.12.17"
ThisBuild / scalaVersion := "2.12.15"

val scalaMajorVersion = 2.12

Expand All @@ -20,27 +20,22 @@ val excludes = Seq(
)

val coreDependencies = Seq(
// Excluding protobuf-java, as spark-core is bringing the older version transitively.
"org.apache.spark" %% "spark-core" % sparkVersion % "compile" exclude("com.google.protobuf", "protobuf-java"),
"org.apache.spark" %% "spark-core" % sparkVersion % "compile",
"org.apache.spark" %% "spark-mllib" % sparkVersion % "compile",
"org.apache.spark" %% "spark-avro" % sparkVersion % "compile",
"org.apache.spark" %% "spark-avro" % sparkVersion % "provided",
"org.apache.spark" %% "spark-tags" % sparkVersion % "test",
"com.globalmentor" % "hadoop-bare-naked-local-fs" % "0.1.0" % "test",
"org.scalatest" %% "scalatest" % "3.2.14" % "test")
val extraDependencies = Seq(
"commons-lang" % "commons-lang" % "2.6",
"org.scalactic" %% "scalactic" % "3.2.14",
"io.spray" %% "spray-json" % "1.3.5",
"com.jcraft" % "jsch" % "0.1.54",
"org.apache.httpcomponents.client5" % "httpclient5" % "5.1.3",
"org.apache.httpcomponents" % "httpmime" % "4.5.13",
"com.linkedin.isolation-forest" %% "isolation-forest_3.4.2" % "3.0.4"
"com.linkedin.isolation-forest" %% "isolation-forest_3.3.3" % "3.0.4"
exclude("com.google.protobuf", "protobuf-java") exclude("org.apache.spark", "spark-mllib_2.12")
exclude("org.apache.spark", "spark-core_2.12") exclude("org.apache.spark", "spark-avro_2.12")
exclude("org.apache.spark", "spark-sql_2.12"),
// Although breeze 2.1.0 is already provided by Spark, this is needed for Azure Synapse Spark 3.4 pools.
// Otherwise a NoSuchMethodError will be thrown by interpretability code.
"org.scalanlp" %% "breeze" % "2.1.0"
).map(d => d excludeAll (excludes: _*))
val dependencies = coreDependencies ++ extraDependencies

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ object PyCodegen {
// An `Already borrowed` error occurs in transformers 4.16.2 when using tokenizers
s"""extras_require={"extras": [
| "cmake",
| "horovod==0.28.1",
| "horovod==0.27.0",
| "pytorch_lightning>=1.5.0,<1.5.10",
| "torch==1.13.1",
| "torchvision>=0.14.1",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ object PackageUtils {
// Use a fixed version for local testing
// val PackageMavenCoordinate = s"$PackageGroup:$PackageName:1.0.5"

private val AvroCoordinate = "org.apache.spark:spark-avro_2.12:3.4.1"
private val AvroCoordinate = "org.apache.spark:spark-avro_2.12:3.3.3"
val PackageRepository: String = SparkMLRepository

// If testing onnx package with snapshots repo, make sure to switch to using
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

package com.microsoft.azure.synapse.ml.exploratory

import breeze.stats.distributions.{ChiSquared, RandBasis}
import breeze.stats.distributions.{ChiSquared}
import com.microsoft.azure.synapse.ml.codegen.Wrappable
import com.microsoft.azure.synapse.ml.core.schema.DatasetExtensions
import com.microsoft.azure.synapse.ml.logging.{FeatureNames, SynapseMLLogging}
Expand Down Expand Up @@ -261,7 +261,6 @@ private[exploratory] case class DistributionMetrics(numFeatures: Int,

// Calculates left-tailed p-value from degrees of freedom and chi-squared test statistic
def chiSquaredPValue: Column = {
implicit val rand: RandBasis = RandBasis.mt0
val degOfFreedom = numFeatures - 1
val scoreCol = chiSquaredTestStatistic
val chiSqPValueUdf = udf(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import com.microsoft.azure.synapse.ml.core.env.StreamUtilities.using

import java.io._
import scala.collection.JavaConverters._
import breeze.linalg.functions.euclideanDistance

private case class Query(point: DenseVector[Double],
normOfQueryPoint: Double,
Expand Down Expand Up @@ -100,16 +101,16 @@ trait BallTreeBase[V] {
}

/** Performs fast lookups of nearest neighbors using the Ball Tree algorithm for space partitioning
*
* Note that this code borrows heavily from
* https://github.com/felixmaximilian/mips
*
* @author Felix Maximilian
*/
*
* Note that this code borrows heavily from
* https://github.com/felixmaximilian/mips
*
* @author Felix Maximilian
*/
case class BallTree[V](override val keys: IndexedSeq[DenseVector[Double]],
override val values: IndexedSeq[V],
override val leafSize: Int = 50) //scalastyle:ignore magic.number
extends Serializable with BallTreeBase[V] {
extends Serializable with BallTreeBase[V] {

private val root: Node = makeBallTree(pointIdx)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,20 +199,17 @@ object SparkHelpers {

def flatten(ratings: Dataset[_], num: Int, dstOutputColumn: String, srcOutputColumn: String): DataFrame = {
import ratings.sparkSession.implicits._
import org.apache.spark.sql.functions.{collect_top_k, struct}

val topKAggregator = new TopByKeyAggregator[Int, Int, Float](num, Ordering.by(_._2))
val recs = ratings.as[(Int, Int, Float)].groupByKey(_._1).agg(topKAggregator.toColumn)
.toDF("id", "recommendations")

val arrayType = ArrayType(
new StructType()
.add(dstOutputColumn, IntegerType)
.add(Constants.RatingCol, FloatType)
)

ratings.toDF(srcOutputColumn, dstOutputColumn, Constants.RatingCol).groupBy(srcOutputColumn)
.agg(collect_top_k(struct(Constants.RatingCol, dstOutputColumn), num, false))
.as[(Int, Seq[(Float, Int)])]
.map(t => (t._1, t._2.map(p => (p._2, p._1))))
.toDF(srcOutputColumn, Constants.Recommendations)
.withColumn(Constants.Recommendations, col(Constants.Recommendations).cast(arrayType))
recs.select(col("id").as(srcOutputColumn), col("recommendations").cast(arrayType))
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ object RTestGen {
| "spark.sql.shuffle.partitions=10",
| "spark.sql.crossJoin.enabled=true")
|
|sc <- spark_connect(master = "local", version = "3.4.1", config = conf)
|sc <- spark_connect(master = "local", version = "3.3.3", config = conf)
|
|""".stripMargin, StandardOpenOption.CREATE)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ object DatabricksUtilities {

// ADB Info
val Region = "eastus"
val PoolName = "synapseml-build-13.3"
val GpuPoolName = "synapseml-build-13.3-gpu"
val AdbRuntime = "13.3.x-scala2.12"
// https://docs.databricks.com/en/release-notes/runtime/13.3lts-ml.html
val AdbGpuRuntime = "13.3.x-gpu-ml-scala2.12"
val PoolName = "synapseml-build-12.2"
val GpuPoolName = "synapseml-build-12.2-gpu"
val AdbRuntime = "12.2.x-scala2.12"
// https://learn.microsoft.com/en-us/azure/databricks/release-notes/runtime/
val AdbGpuRuntime = "12.2.x-gpu-ml-scala2.12"
val NumWorkers = 5
val AutoTerminationMinutes = 15

Expand Down Expand Up @@ -84,9 +84,9 @@ object DatabricksUtilities {
Map("maven" -> Map("coordinates" -> PackageMavenCoordinate, "repo" -> PackageRepository)),
Map("pypi" -> Map("package" -> "pytorch-lightning==1.5.0")),
Map("pypi" -> Map("package" -> "torchvision==0.14.1")),
Map("pypi" -> Map("package" -> "transformers==4.32.1")),
Map("pypi" -> Map("package" -> "petastorm==0.12.0")),
Map("pypi" -> Map("package" -> "protobuf==3.20.3"))
Map("pypi" -> Map("package" -> "transformers==4.25.1")),
Map("pypi" -> Map("package" -> "petastorm==0.12.1")),
Map("pypi" -> Map("package" -> "protobuf==3.19.4"))
).toJson.compactPrint

val RapidsInitScripts: String = List(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ object SynapseExtensionUtilities {
|"{
| 'Default${store}ArtifactId': '$storeId',
| 'ExecutableFile': '$path',
| 'SparkVersion':'3.4',
| 'SparkVersion':'3.3',
| 'SparkSettings': {
| 'spark.jars.packages' : '$SparkMavenPackageList',
| 'spark.jars.repositories' : '$SparkMavenRepositoryList',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ object SynapseUtilities {
| "nodeSizeFamily": "MemoryOptimized",
| "provisioningState": "Succeeded",
| "sessionLevelPackagesEnabled": "true",
| "sparkVersion": "3.4"
| "sparkVersion": "3.3"
| }
|}
|""".stripMargin
Expand Down
10 changes: 5 additions & 5 deletions deep-learning/src/main/python/horovod_installation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ set -eu
# Install prerequisite libraries that horovod depends on
pip install pytorch-lightning==1.5.0
pip install torchvision==0.14.1
pip install transformers==4.32.1
pip install transformers==4.25.1
# Quote the requirement specifier: unquoted, the shell parses ">=0.12.0" as a
# redirection of stdout to a file named "=0.12.0" and pip installs petastorm unconstrained.
pip install "petastorm>=0.12.0"
pip install protobuf==3.20.3
pip install protobuf==3.19.1

# Remove Outdated Signing Key:
sudo apt-key del 7fa2af80
Expand All @@ -35,13 +35,13 @@ libcusparse-dev-11-0=11.1.1.245-1

git clone --recursive https://github.com/horovod/horovod.git
cd horovod
# git fetch origin refs/tags/v0.28.1:tags/v0.28.1
git checkout 1d217b59949986d025f6db93c49943fb6b6cc78f
# git fetch origin refs/tags/v0.27.0:tags/v0.27.0
git checkout bfaca90d5cf66780a97d8799d4e1573855b64560
git checkout -b tmp-branch
rm -rf build/ dist/
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11/ HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 \
/databricks/python3/bin/python setup.py bdist_wheel

readlink -f dist/horovod-*.whl

pip install --no-cache-dir dist/horovod-0.28.1-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps
# Install whichever wheel was just built instead of hard-coding its version and
# CPython/ABI tag; dist/ is removed before the build, so the glob matches only that wheel.
pip install --no-cache-dir dist/horovod-*.whl --force-reinstall --no-deps
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
if _TRANSFORMERS_AVAILABLE:
import transformers

_TRANSFORMERS_EQUAL_4_32_1 = transformers.__version__ == "4.32.1"
if _TRANSFORMERS_EQUAL_4_32_1:
_TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1"
if _TRANSFORMERS_EQUAL_4_25_1:
from transformers import AutoTokenizer
else:
raise RuntimeError(
"transformers should be == 4.32.1, found: {}".format(
"transformers should be == 4.25.1, found: {}".format(
transformers.__version__
)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
if _HOROVOD_AVAILABLE:
import horovod

_HOROVOD_EQUAL_0_28_1 = horovod.__version__ == "0.28.1"
if not _HOROVOD_EQUAL_0_28_1:
_HOROVOD_EQUAL_0_27_0 = horovod.__version__ == "0.27.0"
if not _HOROVOD_EQUAL_0_27_0:
raise RuntimeError(
"horovod should be of version 0.28.1, found: {}".format(horovod.__version__)
"horovod should be of version 0.27.0, found: {}".format(horovod.__version__)
)
else:
raise ModuleNotFoundError("module not found: horovod")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
if _TRANSFORMERS_AVAILABLE:
import transformers

_TRANSFORMERS_EQUAL_4_32_1 = transformers.__version__ == "4.32.1"
if _TRANSFORMERS_EQUAL_4_32_1:
_TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1"
if _TRANSFORMERS_EQUAL_4_25_1:
from transformers import AutoModelForSequenceClassification
else:
raise RuntimeError(
"transformers should be == 4.32.1, found: {}".format(
"transformers should be == 4.25.1, found: {}".format(
transformers.__version__
)
)
Expand Down
18 changes: 9 additions & 9 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@ channels:
- conda-forge
- default
dependencies:
- python=3.11.8
- python=3.8.8
- requests=2.26.0
- pip=21.3
- r-base=4.1.1
- r-sparklyr=1.8.1
- r-devtools=2.4.2
- pip:
- pyarrow>=0.15.0
- pyspark==3.4.1
- pandas==1.4.0
- pyspark==3.3.3
- pandas==1.2.5
- wheel
- sphinx==5.0.2
- sphinx==4.2.0
- sphinx_paramlinks==0.5.2
- sphinx_rtd_theme
- coverage
Expand All @@ -32,17 +32,17 @@ dependencies:
- twine
- mlflow
- numpy
- torch==2.0.0
- torchvision==0.15.1
- horovod==0.28.1
- torch==1.13.1
- torchvision==0.14.1
- horovod==0.27.0
- petastorm>=0.11.0
- pytorch_lightning==1.5.0
- onnxmltools==1.7.0
- matplotlib
- Pillow
- transformers==4.32.1
- transformers==4.25.1
- huggingface-hub>=0.8.1
- langchain==0.0.152
- langchain==0.0.151
- openai==0.27.5
- black==22.3.0
- black[jupyter]==22.3.0
Expand Down
2 changes: 1 addition & 1 deletion pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ jobs:
(timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup)
sbt codegen
sbt publishM2
SPARK_VERSION=3.4.1
SPARK_VERSION=3.3.3
HADOOP_VERSION=3
wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
- task: AzureCLI@2
Expand Down
8 changes: 2 additions & 6 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@ addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0")
addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.8")
addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1")
addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.0.0")
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.8")
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.2")
addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.10.0-RC1")
addSbtPlugin("no.arktekk.sbt" % "aether-deploy" % "0.26.0")

ThisBuild / libraryDependencySchemes ++= Seq(
"org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always
)
addSbtPlugin("no.arktekk.sbt" % "aether-deploy" % "0.26.0")
3 changes: 1 addition & 2 deletions start
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#!/bin/bash

export OPENMPI_VERSION="3.1.2"

export SPARK_VERSION="3.4.1"
export SPARK_VERSION="3.3.3"
export HADOOP_VERSION="3.3"
export SYNAPSEML_VERSION="1.0.5" # Binder compatibility version

Expand Down
2 changes: 1 addition & 1 deletion tools/docker/demo/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04
ARG SYNAPSEML_VERSION=1.0.5
ARG DEBIAN_FRONTEND=noninteractive

ENV SPARK_VERSION=3.4.1
ENV SPARK_VERSION=3.3.3
ENV HADOOP_VERSION=3
ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION}
ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64
Expand Down
2 changes: 1 addition & 1 deletion tools/docker/minimal/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04
ARG SYNAPSEML_VERSION=1.0.5
ARG DEBIAN_FRONTEND=noninteractive

ENV SPARK_VERSION=3.4.1
ENV SPARK_VERSION=3.3.3
ENV HADOOP_VERSION=3
ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION}
ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64
Expand Down
2 changes: 1 addition & 1 deletion tools/tests/run_r_tests.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ if (!require("sparklyr")) {
library("sparklyr")
}

spark_install_tar(paste(getwd(), "/../../../../../../spark-3.4.1-bin-hadoop3.tgz", sep = ""))
spark_install_tar(paste(getwd(), "/../../../../../../spark-3.3.3-bin-hadoop3.tgz", sep = ""))

options("testthat.output_file" = "../../../../r-test-results.xml")
devtools::test(reporter = JunitReporter$new())

0 comments on commit d3bbd47

Please sign in to comment.