diff --git a/.gitignore b/.gitignore index 8fd3247cec1..ec1a9ac15e9 100644 --- a/.gitignore +++ b/.gitignore @@ -47,4 +47,7 @@ node_modules/ .Rproj.user # R output -*.Rout \ No newline at end of file +*.Rout + +# Misc +.bsp diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 46b481c7130..739b6065c41 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -55,7 +55,7 @@ this process: #### Implement documentation -- Add a [sample Jupyter notebook](notebooks/samples) that shows the intended use +- Add a [sample Jupyter notebook](notebooks/) that shows the intended use case of your algorithm, with instructions in step-by-step manner. (The same notebook could be used for testing the code.) - Add in-line ScalaDoc comments to your source code, to generate the [API diff --git a/README.md b/README.md index 58c5cdcec6f..f7618c97eac 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ PySpark](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/index.htm | | | | |:--:|:--:|:--:| -| **Isolation Forest on Spark** | [**CyberML**](https://github.com/Azure/mmlspark/blob/master/notebooks/samples/CyberML%20-%20Anomalous%20Access%20Detection.ipynb) | **Conditional KNN** | +| **Isolation Forest on Spark** | [**CyberML**](https://github.com/Azure/mmlspark/blob/master/notebooks/CyberML%20-%20Anomalous%20Access%20Detection.ipynb) | **Conditional KNN** | | Distributed Nonlinear Outlier Detection | Machine Learning Tools for Cyber Security | Scalable KNN Models with Conditional Queries | @@ -86,29 +86,29 @@ PySpark](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/index.htm - Train and evaluate a flight delay prediction system ([example 2]) - Finding anomalous data access patterns using the Access Anomalies package of CyberML ([example 11]) -See our [notebooks](notebooks/samples/) for all examples. +See our [notebooks](notebooks/) for all examples. 
-[example 1]: notebooks/samples/Classification%20-%20Adult%20Census.ipynb "Adult Census Income Training" +[example 1]: notebooks/Classification%20-%20Adult%20Census.ipynb "Adult Census Income Training" -[example 2]: notebooks/samples/Regression%20-%20Flight%20Delays.ipynb "Regression Example with Flight Delay Dataset" +[example 2]: notebooks/Regression%20-%20Flight%20Delays.ipynb "Regression Example with Flight Delay Dataset" -[example 3]: notebooks/samples/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb "Quantile Regression with LightGBM" +[example 3]: notebooks/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb "Quantile Regression with LightGBM" -[example 4]: notebooks/samples/TextAnalytics%20-%20Amazon%20Book%20Reviews.ipynb "Amazon Book Reviews - TextFeaturizer" +[example 4]: notebooks/TextAnalytics%20-%20Amazon%20Book%20Reviews.ipynb "Amazon Book Reviews - TextFeaturizer" -[example 5]: notebooks/samples/HyperParameterTuning%20-%20Fighting%20Breast%20Cancer.ipynb "Hyperparameter Tuning with MMLSpark" +[example 5]: notebooks/HyperParameterTuning%20-%20Fighting%20Breast%20Cancer.ipynb "Hyperparameter Tuning with MMLSpark" -[example 6]: notebooks/samples/DeepLearning%20-%20CIFAR10%20Convolutional%20Network.ipynb "CIFAR10 CNTK CNN Evaluation" +[example 6]: notebooks/DeepLearning%20-%20CIFAR10%20Convolutional%20Network.ipynb "CIFAR10 CNTK CNN Evaluation" -[example 7]: notebooks/samples/OpenCV%20-%20Pipeline%20Image%20Transformations.ipynb "Pipeline Image Transformations" +[example 7]: notebooks/OpenCV%20-%20Pipeline%20Image%20Transformations.ipynb "Pipeline Image Transformations" -[example 8]: notebooks/samples/DeepLearning%20-%20BiLSTM%20Medical%20Entity%20Extraction.ipynb "Medical Entity Extraction" +[example 8]: notebooks/DeepLearning%20-%20BiLSTM%20Medical%20Entity%20Extraction.ipynb "Medical Entity Extraction" -[example 9]: notebooks/samples/DeepLearning%20-%20Flower%20Image%20Classification.ipynb "Deep Flower 
Classification" +[example 9]: notebooks/DeepLearning%20-%20Flower%20Image%20Classification.ipynb "Deep Flower Classification" [example 10]: notebooks/gpu/DeepLearning%20-%20Distributed%20CNTK%20training.ipynb "CIFAR10 CNTK CNN Training" -[example 11]: notebooks/samples/CyberML%20-%20Anomalous%20Access%20Detection.ipynb "Access Anomalies documenation, training and evaluation example" +[example 11]: notebooks/CyberML%20-%20Anomalous%20Access%20Detection.ipynb "Access Anomalies documenation, training and evaluation example" ## A short example @@ -127,7 +127,7 @@ scoredImages = cntkModel.transform(imagesWithLabels) ... ``` -See [other sample notebooks](notebooks/samples/) as well as the MMLSpark +See [other sample notebooks](notebooks/) as well as the MMLSpark documentation for [Scala](http://mmlspark.azureedge.net/docs/scala/) and [PySpark](http://mmlspark.azureedge.net/docs/pyspark/). diff --git a/build.sbt b/build.sbt index 71471d683ee..fd8fad623ad 100644 --- a/build.sbt +++ b/build.sbt @@ -1,22 +1,20 @@ import java.io.{File, PrintWriter} import java.net.URL + import org.apache.commons.io.FileUtils import sbt.ExclusionRule -import sbt.internal.util.ManagedLogger import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} import scala.xml.transform.{RewriteRule, RuleTransformer} -import scala.sys.process.Process import BuildUtils._ +import xerial.sbt.Sonatype._ val condaEnvName = "mmlspark" -name := "mmlspark" -organization := "com.microsoft.ml.spark" -scalaVersion := "2.12.10" val sparkVersion = "3.1.2" +name := "mmlspark" +ThisBuild / organization := "com.microsoft.ml.spark" +ThisBuild / scalaVersion := "2.12.10" -//val scalaMajorVersion = settingKey[String]("scalaMajorVersion") -//scalaMajorVersion := {scalaVersion.value.split(".".toCharArray).dropRight(0).mkString(".")} val scalaMajorVersion = 2.12 val excludes = Seq( @@ -24,42 +22,28 @@ val excludes = Seq( ExclusionRule("org.scalatest") ) -libraryDependencies ++= Seq( +val coreDependencies = Seq( 
"org.apache.spark" %% "spark-core" % sparkVersion % "compile", "org.apache.spark" %% "spark-mllib" % sparkVersion % "compile", "org.apache.spark" %% "spark-avro" % sparkVersion % "provided", "org.apache.spark" %% "spark-tags" % sparkVersion % "test", "org.scalatest" %% "scalatest" % "3.0.5" % "test") - -libraryDependencies ++= Seq( +val extraDependencies = Seq( "org.scalactic" %% "scalactic" % "3.0.5", "io.spray" %% "spray-json" % "1.3.2", - "com.microsoft.cntk" % "cntk" % "2.4", - "org.openpnp" % "opencv" % "3.2.0-1", "com.jcraft" % "jsch" % "0.1.54", - "com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.14.0", "org.apache.httpcomponents" % "httpclient" % "4.5.6", "org.apache.httpcomponents" % "httpmime" % "4.5.6", - "com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.2.110", - "com.github.vowpalwabbit" % "vw-jni" % "8.9.1", "com.linkedin.isolation-forest" %% "isolation-forest_3.0.0" % "1.0.1", ).map(d => d excludeAll (excludes: _*)) +val dependencies = coreDependencies ++ extraDependencies def txt(e: Elem, label: String): String = "\"" + e.child.filter(_.label == label).flatMap(_.text).mkString + "\"" -def activateCondaEnv: Seq[String] = { - if (sys.props("os.name").toLowerCase.contains("windows")) { - osPrefix ++ Seq("activate", condaEnvName, "&&") - } else { - Seq() - //TODO figure out why this doesent work - //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&") - } -} - val omittedDeps = Set(s"spark-core_${scalaMajorVersion}", s"spark-mllib_${scalaMajorVersion}", "org.scala-lang") // skip dependency elements with a scope -pomPostProcess := { (node: XmlNode) => + +def pomPostFunc(node: XmlNode): scala.xml.Node = { new RuleTransformer(new RewriteRule { override def transform(node: XmlNode): XmlNodeSeq = node match { case e: Elem if e.label == "dependency" @@ -77,191 +61,17 @@ pomPostProcess := { (node: XmlNode) => }).transform(node).head } -resolvers += "Speech" at "https://mmlspark.blob.core.windows.net/maven/" - -val createCondaEnvTask 
= TaskKey[Unit]("createCondaEnv", "create conda env") -createCondaEnvTask := { - val s = streams.value - val hasEnv = Process("conda env list").lineStream.toList - .map(_.split("\\s+").head).contains(condaEnvName) - if (!hasEnv) { - runCmd(Seq("conda", "env", "create", "-f", "environment.yaml")) - } else { - println("Found conda env " + condaEnvName) - } -} - -val condaEnvLocation = TaskKey[String]("condaEnvLocation", "get install location of conda env") -condaEnvLocation := { - val s = streams.value - createCondaEnvTask.value - Process("conda env list").lineStream.toList - .map(_.split("\\s+")) - .map(l => (l.head, l.reverse.head)) - .filter(p => p._1 == condaEnvName) - .head._2 -} - +pomPostProcess := pomPostFunc -val cleanCondaEnvTask = TaskKey[Unit]("cleanCondaEnv", "create conda env") -cleanCondaEnvTask := { - runCmd(Seq("conda", "env", "remove", "--name", condaEnvName, "-y")) -} - -val codegenTask = TaskKey[Unit]("codegen", "Generate Code") -codegenTask := { - (runMain in Test).toTask(" com.microsoft.ml.spark.codegen.CodeGen").value -} - -val testgenTask = TaskKey[Unit]("testgen", "Generate Tests") -testgenTask := { - (runMain in Test).toTask(" com.microsoft.ml.spark.codegen.TestGen").value -} - -val genDir = join("target", s"scala-${scalaMajorVersion}", "generated") -val unidocDir = join("target", s"scala-${scalaMajorVersion}", "unidoc") -val pythonSrcDir = join(genDir.toString, "src", "python") -val unifiedDocDir = join(genDir.toString, "doc") -val pythonDocDir = join(unifiedDocDir.toString, "pyspark") -val pythonPackageDir = join(genDir.toString, "package", "python") -val pythonTestDir = join(genDir.toString, "test", "python") -val rSrcDir = join(genDir.toString, "src", "R", "mmlspark") -val rPackageDir = join(genDir.toString, "package", "R") - -val pythonizedVersion = settingKey[String]("Pythonized version") -pythonizedVersion := { - if (version.value.contains("-")) { - version.value.split("-".head).head + ".dev1" - } else { - version.value - } -} - -val 
rVersion = settingKey[String]("R version") -rVersion := { - if (version.value.contains("-")) { - version.value.split("-".head).head - } else { - version.value - } -} - -def rCmd(cmd: Seq[String], wd: File, libPath: String): Unit = { - runCmd(activateCondaEnv ++ cmd, wd, Map("R_LIBS" -> libPath, "R_USER_LIBS" -> libPath)) -} - -val packageR = TaskKey[Unit]("packageR", "Generate roxygen docs and zip R package") -packageR := { - createCondaEnvTask.value - codegenTask.value - val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString - rCmd(Seq("R", "-q", "-e", "roxygen2::roxygenise()"), rSrcDir, libPath) - rPackageDir.mkdirs() - zipFolder(rSrcDir, new File(rPackageDir, s"mmlspark-${version.value}.zip")) -} - -val testR = TaskKey[Unit]("testR", "Run testthat on R tests") -testR := { - packageR.value - publishLocal.value - val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString - rCmd(Seq("R", "CMD", "INSTALL", "--no-multiarch", "--with-keep.source", "mmlspark"), rSrcDir.getParentFile, libPath) - val testRunner = join("tools", "tests", "run_r_tests.R").getAbsolutePath - rCmd(Seq("Rscript", testRunner), rSrcDir, libPath) -} - -val publishR = TaskKey[Unit]("publishR", "publish R package to blob") -publishR := { - codegenTask.value - packageR.value - val rPackage = rPackageDir.listFiles().head - singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr") -} - -val packagePythonTask = TaskKey[Unit]("packagePython", "Package python sdk") -packagePythonTask := { - codegenTask.value - createCondaEnvTask.value - val destPyDir = join("target", s"scala-${scalaMajorVersion}", "classes", "mmlspark") - if (destPyDir.exists()) FileUtils.forceDelete(destPyDir) - FileUtils.copyDirectory(join(pythonSrcDir.getAbsolutePath, "mmlspark"), destPyDir) - runCmd( - activateCondaEnv ++ - Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", s"${pythonPackageDir.absolutePath}"), - pythonSrcDir) -} - -val installPipPackageTask = 
TaskKey[Unit]("installPipPackage", "install python sdk") -installPipPackageTask := { - packagePythonTask.value - publishLocal.value - runCmd( - activateCondaEnv ++ Seq("pip", "install", "-I", - s"mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl"), - pythonPackageDir) -} - -val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python") -generatePythonDoc := { - installPipPackageTask.value - runCmd(activateCondaEnv ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), - join(pythonSrcDir.toString, "mmlspark")) - runCmd(activateCondaEnv ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), - join(pythonSrcDir.toString, "mmlspark")) -} - -val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python") -publishDocs := { - generatePythonDoc.value - (Compile / unidoc).value - val html = - """ - |
-      |pyspark/
-      |scala/
-      |
- """.stripMargin - val scalaDir = join(unifiedDocDir.toString, "scala") - if (scalaDir.exists()) FileUtils.forceDelete(scalaDir) - FileUtils.copyDirectory(unidocDir, scalaDir) - FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8") - uploadToBlob(unifiedDocDir.toString, version.value, "docs") -} - -val publishPython = TaskKey[Unit]("publishPython", "publish python wheel") -publishPython := { - publishLocal.value - packagePythonTask.value - singleUploadToBlob( - join(pythonPackageDir.toString, s"mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl").toString, - version.value + s"/mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl", - "pip") -} - -val testPythonTask = TaskKey[Unit]("testPython", "test python sdk") - -testPythonTask := { - installPipPackageTask.value - testgenTask.value - runCmd( - activateCondaEnv ++ Seq("python", - "-m", - "pytest", - "--cov=mmlspark", - "--junitxml=../../../../python-test-results.xml", - "--cov-report=xml", - "mmlsparktest" - ), - new File(s"target/scala-${scalaMajorVersion}/generated/test/python/") - ) -} +val speechResolver = "Speech" at "https://mmlspark.blob.core.windows.net/maven/" val getDatasetsTask = TaskKey[Unit]("getDatasets", "download datasets used for testing") val datasetName = "datasets-2020-08-27.tgz" val datasetUrl = new URL(s"https://mmlspark.blob.core.windows.net/installers/$datasetName") val datasetDir = settingKey[File]("The directory that holds the dataset") -datasetDir := { - join(target.value.toString, s"scala-${scalaMajorVersion}", "datasets", datasetName.split(".".toCharArray.head).head) +ThisBuild / datasetDir := { + join(artifactPath.in(packageBin).in(Compile).value.getParentFile, + "datasets", datasetName.split(".".toCharArray.head).head) } getDatasetsTask := { @@ -276,48 +86,61 @@ getDatasetsTask := { val genBuildInfo = TaskKey[Unit]("genBuildInfo", "generate a build info file") genBuildInfo := { - val buildInfo = + val docInfo = s""" - |MMLSpark Build and 
Release Information - |--------------- - | - |### Maven Coordinates - | `${organization.value}:${name.value}_${scalaMajorVersion}:${version.value}` - | - |### Maven Resolver - | `https://mmlspark.azureedge.net/maven` | |### Documentation Pages: |[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html) |[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html) | """.stripMargin + val buildInfo = (root / blobArtifactInfo).value + docInfo val infoFile = join("target", "Build.md") if (infoFile.exists()) FileUtils.forceDelete(infoFile) FileUtils.writeStringToFile(infoFile, buildInfo, "utf-8") } -val setupTask = TaskKey[Unit]("setup", "set up library for intellij") -setupTask := { - (Compile / compile).toTask.value - (Test / compile).toTask.value - getDatasetsTask.value +val rootGenDir = SettingKey[File]("rootGenDir") +rootGenDir := { + val targetDir = artifactPath.in(packageBin).in(Compile).in(root).value.getParentFile + join(targetDir, "generated") } -val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob") -publishBlob := { - publishM2.value - val scalaVersionSuffix = scalaVersion.value.split(".".toCharArray.head).dropRight(1).mkString(".") - val nameAndScalaVersion = s"${name.value}_$scalaVersionSuffix" - - val localPackageFolder = join( - Seq(new File(new URI(Resolver.mavenLocal.root)).getAbsolutePath) - ++ organization.value.split(".".toCharArray.head) - ++ Seq(nameAndScalaVersion, version.value): _*).toString +val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python") +generatePythonDoc := { + installPipPackage.all(ScopeFilter( + inProjects(core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile))).value + mergePyCode.all(ScopeFilter( + inProjects(core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile)) + ).value + val targetDir = 
artifactPath.in(packageBin).in(Compile).in(root).value.getParentFile + val codegenDir = join(targetDir, "generated") + val dir = join(codegenDir, "src", "python", "mmlspark") + runCmd(activateCondaEnv.value ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), dir) + runCmd(activateCondaEnv.value ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), dir) +} - val blobMavenFolder = organization.value.replace(".", "/") + - s"/$nameAndScalaVersion/${version.value}" - uploadToBlob(localPackageFolder, blobMavenFolder, "maven") +val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python") +publishDocs := { + //generatePythonDoc.value + (root / Compile / unidoc).value + val html = + """ + |
+      |pyspark/
+      |scala/
+      |
+ """.stripMargin + val targetDir = artifactPath.in(packageBin).in(Compile).in(root).value.getParentFile + val codegenDir = join(targetDir, "generated") + val unifiedDocDir = join(codegenDir, "doc") + val scalaDir = join(unifiedDocDir.toString, "scala") + if (scalaDir.exists()) FileUtils.forceDelete(scalaDir) + FileUtils.copyDirectory(join(targetDir, "unidoc"), scalaDir) + FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8") + uploadToBlob(unifiedDocDir.toString, version.value, "docs") } val release = TaskKey[Unit]("release", "publish the library to mmlspark blob") @@ -355,11 +178,8 @@ publishBadges := { } val settings = Seq( - (scalastyleConfig in Test) := baseDirectory.value / "scalastyle-test-config.xml", + (scalastyleConfig in Test) := (ThisBuild / baseDirectory).value / "scalastyle-test-config.xml", logBuffered in Test := false, - buildInfoKeys := Seq[BuildInfoKey]( - name, version, scalaVersion, sbtVersion, - baseDirectory, datasetDir, pythonizedVersion, rVersion), parallelExecution in Test := false, test in assembly := {}, assemblyMergeStrategy in assembly := { @@ -367,14 +187,90 @@ val settings = Seq( case x => MergeStrategy.first }, assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false), - buildInfoPackage := "com.microsoft.ml.spark.build") - -lazy val mmlspark = (project in file(".")) - .enablePlugins(BuildInfoPlugin) - .enablePlugins(ScalaUnidocPlugin) - .settings(settings: _*) + autoAPIMappings := true, + pomPostProcess := pomPostFunc, +) +ThisBuild / publishMavenStyle := true + +lazy val core = (project in file("core")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .settings((settings ++ Seq( + libraryDependencies ++= dependencies, + buildInfoKeys ++= Seq[BuildInfoKey]( + datasetDir, + version, + scalaVersion, + sbtVersion, + baseDirectory + ), + name := "mmlspark-core", + buildInfoPackage := "com.microsoft.ml.spark.build", + )): _*) + +lazy val deepLearning = (project in 
file("deep-learning")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.microsoft.cntk" % "cntk" % "2.4"), + name := "mmlspark-deep-learning", + )): _*) + +lazy val lightgbm = (project in file("lightgbm")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.2.110"), + name := "mmlspark-lightgbm" + )): _*) + +lazy val vw = (project in file("vw")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.github.vowpalwabbit" % "vw-jni" % "8.9.1"), + name := "mmlspark-vw" + )): _*) + +lazy val cognitive = (project in file("cognitive")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.14.0"), + resolvers += speechResolver, + name := "mmlspark-cognitive" + )): _*) + +lazy val opencv = (project in file("opencv")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("org.openpnp" % "opencv" % "3.2.0-1"), + name := "mmlspark-opencv" + )): _*) + +lazy val root = (project in file(".")) + .aggregate(core, deepLearning, cognitive, vw, lightgbm, opencv) + .dependsOn( + core % "test->test;compile->compile", + deepLearning % "test->test;compile->compile", + cognitive % "test->test;compile->compile", + vw % "test->test;compile->compile", + lightgbm % "test->test;compile->compile", + opencv % "test->test;compile->compile") + .enablePlugins(ScalaUnidocPlugin && SbtPlugin) + .disablePlugins(CodegenPlugin) + .settings(settings ++ Seq( + name := "mmlspark", + )) -import xerial.sbt.Sonatype._ +val setupTask = TaskKey[Unit]("setup", "set up library for intellij") 
+setupTask := { + compile.all(ScopeFilter( + inProjects(root, core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile, Test)) + ).value + getDatasetsTask.value +} sonatypeProjectHosting := Some( GitHubHosting("Azure", "MMLSpark", "mmlspark-support@microsot.com")) @@ -389,33 +285,30 @@ developers := List( ) licenses += ("MIT", url("https://github.com/Azure/mmlspark/blob/master/LICENSE")) -publishMavenStyle := true - -credentials += Credentials("Sonatype Nexus Repository Manager", - "oss.sonatype.org", - Secrets.nexusUsername, - Secrets.nexusPassword) - -pgpPassphrase := Some(Secrets.pgpPassword.toCharArray) -pgpSecretRing := { - val temp = File.createTempFile("secret", ".asc") - new PrintWriter(temp) { - write(Secrets.pgpPrivate); - close() - } - temp -} -pgpPublicRing := { - val temp = File.createTempFile("public", ".asc") - new PrintWriter(temp) { - write(Secrets.pgpPublic); - close() - } - temp -} +// +//credentials += Credentials("Sonatype Nexus Repository Manager", +// "oss.sonatype.org", +// Secrets.nexusUsername, +// Secrets.nexusPassword) +// +//pgpPassphrase := Some(Secrets.pgpPassword.toCharArray) +//pgpSecretRing := { +// val temp = File.createTempFile("secret", ".asc") +// new PrintWriter(temp) { +// write(Secrets.pgpPrivate); +// close() +// } +// temp +//} +//pgpPublicRing := { +// val temp = File.createTempFile("public", ".asc") +// new PrintWriter(temp) { +// write(Secrets.pgpPublic); +// close() +// } +// temp +//} +//publishTo := sonatypePublishToBundle.value dynverSonatypeSnapshots in ThisBuild := true dynverSeparator in ThisBuild := "-" -publishTo := sonatypePublishToBundle.value - -// Break Cache - 1 diff --git a/src/main/python/mmlspark/cognitive/AzureSearchWriter.py b/cognitive/src/main/python/mmlspark/cognitive/AzureSearchWriter.py similarity index 100% rename from src/main/python/mmlspark/cognitive/AzureSearchWriter.py rename to cognitive/src/main/python/mmlspark/cognitive/AzureSearchWriter.py diff --git 
a/src/main/python/mmlspark/cognitive/BingImageSearch.py b/cognitive/src/main/python/mmlspark/cognitive/BingImageSearch.py similarity index 100% rename from src/main/python/mmlspark/cognitive/BingImageSearch.py rename to cognitive/src/main/python/mmlspark/cognitive/BingImageSearch.py diff --git a/src/__init__.py b/cognitive/src/main/python/mmlspark/cognitive/__init__.py similarity index 100% rename from src/__init__.py rename to cognitive/src/main/python/mmlspark/cognitive/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala index 96024a68b63..b405bb13b09 100644 --- 
a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala @@ -143,7 +143,8 @@ object AzureSearchWriter extends IndexParser with SLogging { val Logger: Logger = LogManager.getRootLogger - private def checkForErrors(fatal: Boolean)(errorRow: Row, inputRow: Row): Option[Row] = { + private def checkForErrors( + fatal: Boolean)(errorRow: Row, inputRow: Row): Option[Row] = { Option(errorRow).map { r => val message = s"Service Exception:\n\t ${r.toString()} \n for input:\n\t ${inputRow.toString()}" if (fatal) { diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala rename to 
cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala 
b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala similarity index 96% rename from src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala index 3e4dc4e4a14..01de211a8e0 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala @@ -59,11 +59,11 @@ object RESTHelpers { response } else { val requestBodyOpt = Try(request match { - case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent) + case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent, "UTF-8") case _ => "" }).get - val responseBodyOpt = Try(IOUtils.toString(response.getEntity.getContent)).getOrElse("") + val responseBodyOpt = Try(IOUtils.toString(response.getEntity.getContent, "UTF-8")).getOrElse("") throw new RuntimeException( s"Failed: " + diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala similarity index 93% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala rename to 
cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala index 361c63507cf..b240da1a95f 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala @@ -32,7 +32,7 @@ object SpeechAPI { using(Client.execute(request)) { response => if (!response.getStatusLine.getStatusCode.toString.startsWith("2")) { val bodyOpt = request match { - case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent) + case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent, "UTF-8") case _ => "" } throw new RuntimeException( @@ -40,7 +40,7 @@ object SpeechAPI { s"requestUrl: ${request.getURI}" + s"requestBody: $bodyOpt") } - IOUtils.toString(response.getEntity.getContent) + IOUtils.toString(response.getEntity.getContent, "UTF-8") .parseJson.asJsObject().fields("Signature").compactPrint }.get }) diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala rename to 
cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala index 51a965b0d08..45447ac5f2d 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala @@ -8,15 +8,17 @@ import java.lang.ProcessBuilder.Redirect import java.net.{URI, URL} import java.util.UUID import java.util.concurrent.{LinkedBlockingQueue, TimeUnit} + import com.microsoft.cognitiveservices.speech._ import com.microsoft.cognitiveservices.speech.audio._ -import com.microsoft.cognitiveservices.speech.transcription.{Conversation, ConversationTranscriber, - ConversationTranscriptionEventArgs, Participant} +import com.microsoft.cognitiveservices.speech.transcription.{ + Conversation, ConversationTranscriber, ConversationTranscriptionEventArgs, Participant} import com.microsoft.cognitiveservices.speech.util.EventHandler import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.cognitive.SpeechFormat._ import com.microsoft.ml.spark.core.contracts.HasOutputCol import com.microsoft.ml.spark.core.schema.{DatasetExtensions, SparkBindings} +import com.microsoft.ml.spark.core.utils.OsUtils import com.microsoft.ml.spark.io.http.HasURL import com.microsoft.ml.spark.logging.BasicLogging import com.microsoft.ml.spark.{CompressedStream, WavStream} @@ -36,10 +38,6 @@ import spray.json._ import scala.concurrent.{ExecutionContext, Future, blocking} import scala.language.existentials -object OsUtils { - val IsWindows: Boolean = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0 -} - object SpeechToTextSDK extends ComplexParamsReadable[SpeechToTextSDK] private[ml] class BlockingQueueIterator[T](lbq: LinkedBlockingQueue[Option[T]], diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala diff --git a/src/main/__init__.py b/cognitive/src/test/python/mmlsparktest/cognitive/__init__.py similarity index 100% rename from src/main/__init__.py rename to cognitive/src/test/python/mmlsparktest/cognitive/__init__.py diff --git a/src/test/python/mmlsparktest/cognitive/test_simple.py b/cognitive/src/test/python/mmlsparktest/cognitive/test_simple.py similarity index 100% rename from src/test/python/mmlsparktest/cognitive/test_simple.py rename to cognitive/src/test/python/mmlsparktest/cognitive/test_simple.py diff --git a/src/test/resources/audio1.txt b/cognitive/src/test/resources/audio1.txt similarity index 100% rename from src/test/resources/audio1.txt rename to cognitive/src/test/resources/audio1.txt diff --git a/src/test/resources/audio1.wav b/cognitive/src/test/resources/audio1.wav similarity index 100% rename from src/test/resources/audio1.wav rename to cognitive/src/test/resources/audio1.wav diff --git a/src/test/resources/audio2.txt b/cognitive/src/test/resources/audio2.txt similarity index 100% rename from src/test/resources/audio2.txt rename to 
cognitive/src/test/resources/audio2.txt diff --git a/src/test/resources/audio2.wav b/cognitive/src/test/resources/audio2.wav similarity index 100% rename from src/test/resources/audio2.wav rename to cognitive/src/test/resources/audio2.wav diff --git a/src/test/resources/audio3.mp3 b/cognitive/src/test/resources/audio3.mp3 similarity index 100% rename from src/test/resources/audio3.mp3 rename to cognitive/src/test/resources/audio3.mp3 diff --git a/src/test/resources/audio3.txt b/cognitive/src/test/resources/audio3.txt similarity index 100% rename from src/test/resources/audio3.txt rename to cognitive/src/test/resources/audio3.txt diff --git a/src/test/resources/audio4.txt b/cognitive/src/test/resources/audio4.txt similarity index 100% rename from src/test/resources/audio4.txt rename to cognitive/src/test/resources/audio4.txt diff --git a/src/test/resources/dialogue.mp3 b/cognitive/src/test/resources/dialogue.mp3 similarity index 100% rename from src/test/resources/dialogue.mp3 rename to cognitive/src/test/resources/dialogue.mp3 diff --git a/src/test/resources/lily.wav b/cognitive/src/test/resources/lily.wav similarity index 100% rename from src/test/resources/lily.wav rename to cognitive/src/test/resources/lily.wav diff --git a/src/test/resources/mark.wav b/cognitive/src/test/resources/mark.wav similarity index 100% rename from src/test/resources/mark.wav rename to cognitive/src/test/resources/mark.wav diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala 
b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala index 11a75834a4f..6255d9462b4 100644 --- a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala @@ -9,12 +9,10 @@ import com.microsoft.ml.spark.core.test.base.{Flaky, TestBase} import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.NamespaceInjections.pipelineModel import org.apache.spark.ml.util.MLReadable -import org.apache.spark.sql.functions.{corr, typedLit} +import org.apache.spark.sql.functions.typedLit import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.scalactic.Equality -import org.scalatest.Assertion import com.microsoft.ml.spark.FluentAPI._ -import com.microsoft.ml.spark.featurize.text.PageSplitter trait CognitiveKey { lazy val cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", Secrets.CognitiveApiKey) diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala similarity index 98% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala index 3cc8c4eefcf..3b1744c63f4 100644 --- a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala @@ -35,7 +35,7 @@ object FaceUtils extends CognitiveKey { using(Client.execute(request)) { response => if (!response.getStatusLine.getStatusCode.toString.startsWith("2")) { val bodyOpt = request match { - case er: 
HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent) + case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent, "UTF-8") case _ => "" } throw new RuntimeException( @@ -43,7 +43,7 @@ object FaceUtils extends CognitiveKey { s"requestUrl: ${request.getURI}" + s"requestBody: $bodyOpt") } - IOUtils.toString(response.getEntity.getContent) + IOUtils.toString(response.getEntity.getContent, "UTF-8") }.get }) } diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala rename to 
cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala index 0f543420bd2..9b8d91af8ae 100644 --- a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala @@ -14,7 +14,8 @@ import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.http.client.methods.HttpDelete import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{lit, udf, col, split} +import org.apache.spark.sql.functions.{col, lit, split, udf} + import scala.collection.mutable import scala.concurrent.blocking diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala similarity index 94% rename from src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala index 620ab98aa28..d88d70d63af 100644 --- 
a/src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala @@ -1,11 +1,12 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. -package com.microsoft.ml.spark.core.utils +package com.microsoft.ml.spark.core.utils.utils import com.microsoft.ml.spark.cognitive.TextSentiment import com.microsoft.ml.spark.core.env.FileUtilities.join import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.utils.ModelEquality import com.microsoft.ml.spark.stages.DropColumns class ModelEqualitySuite extends TestBase { diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/SlicerFunctionsSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/SlicerFunctionsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/utils/SlicerFunctionsSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/SlicerFunctionsSuite.scala diff --git a/src/main/python/LICENSE.txt b/core/src/main/python/LICENSE.txt similarity index 100% rename from src/main/python/LICENSE.txt rename to core/src/main/python/LICENSE.txt diff --git a/src/main/python/MANIFEST.in b/core/src/main/python/MANIFEST.in similarity index 100% rename from src/main/python/MANIFEST.in rename to core/src/main/python/MANIFEST.in diff --git a/src/main/python/__init__.py b/core/src/main/python/__init__.py similarity index 100% rename from src/main/python/__init__.py rename to core/src/main/python/__init__.py diff --git a/src/main/python/mmlspark/README.txt b/core/src/main/python/mmlspark/README.txt similarity index 100% rename from src/main/python/mmlspark/README.txt rename to core/src/main/python/mmlspark/README.txt diff --git a/src/main/python/mmlspark/__init__.py b/core/src/main/python/mmlspark/__init__.py similarity index 
100% rename from src/main/python/mmlspark/__init__.py rename to core/src/main/python/mmlspark/__init__.py diff --git a/src/main/python/mmlspark/automl/BestModel.py b/core/src/main/python/mmlspark/automl/BestModel.py similarity index 100% rename from src/main/python/mmlspark/automl/BestModel.py rename to core/src/main/python/mmlspark/automl/BestModel.py diff --git a/src/main/python/mmlspark/automl/HyperparamBuilder.py b/core/src/main/python/mmlspark/automl/HyperparamBuilder.py similarity index 100% rename from src/main/python/mmlspark/automl/HyperparamBuilder.py rename to core/src/main/python/mmlspark/automl/HyperparamBuilder.py diff --git a/src/main/python/mmlspark/automl/TuneHyperparametersModel.py b/core/src/main/python/mmlspark/automl/TuneHyperparametersModel.py similarity index 100% rename from src/main/python/mmlspark/automl/TuneHyperparametersModel.py rename to core/src/main/python/mmlspark/automl/TuneHyperparametersModel.py diff --git a/src/main/python/mmlspark/automl/__init__.py b/core/src/main/python/mmlspark/automl/__init__.py similarity index 100% rename from src/main/python/mmlspark/automl/__init__.py rename to core/src/main/python/mmlspark/automl/__init__.py diff --git a/src/main/python/mmlspark/cntk/__init__.py b/core/src/main/python/mmlspark/core/__init__.py similarity index 100% rename from src/main/python/mmlspark/cntk/__init__.py rename to core/src/main/python/mmlspark/core/__init__.py diff --git a/src/main/python/mmlspark/core/schema/TypeConversionUtils.py b/core/src/main/python/mmlspark/core/schema/TypeConversionUtils.py similarity index 100% rename from src/main/python/mmlspark/core/schema/TypeConversionUtils.py rename to core/src/main/python/mmlspark/core/schema/TypeConversionUtils.py diff --git a/src/main/python/mmlspark/core/schema/Utils.py b/core/src/main/python/mmlspark/core/schema/Utils.py similarity index 100% rename from src/main/python/mmlspark/core/schema/Utils.py rename to core/src/main/python/mmlspark/core/schema/Utils.py diff --git 
a/src/main/python/mmlspark/cognitive/__init__.py b/core/src/main/python/mmlspark/core/schema/__init__.py similarity index 100% rename from src/main/python/mmlspark/cognitive/__init__.py rename to core/src/main/python/mmlspark/core/schema/__init__.py diff --git a/src/main/python/mmlspark/core/__init__.py b/core/src/main/python/mmlspark/core/serialize/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/__init__.py rename to core/src/main/python/mmlspark/core/serialize/__init__.py diff --git a/src/main/python/mmlspark/core/serialize/java_params_patch.py b/core/src/main/python/mmlspark/core/serialize/java_params_patch.py similarity index 100% rename from src/main/python/mmlspark/core/serialize/java_params_patch.py rename to core/src/main/python/mmlspark/core/serialize/java_params_patch.py diff --git a/src/main/python/mmlspark/core/spark/FluentAPI.py b/core/src/main/python/mmlspark/core/spark/FluentAPI.py similarity index 100% rename from src/main/python/mmlspark/core/spark/FluentAPI.py rename to core/src/main/python/mmlspark/core/spark/FluentAPI.py diff --git a/src/main/python/mmlspark/core/schema/__init__.py b/core/src/main/python/mmlspark/core/spark/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/schema/__init__.py rename to core/src/main/python/mmlspark/core/spark/__init__.py diff --git a/src/main/python/mmlspark/core/serialize/__init__.py b/core/src/main/python/mmlspark/cyber/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/serialize/__init__.py rename to core/src/main/python/mmlspark/cyber/__init__.py diff --git a/src/main/python/mmlspark/core/spark/__init__.py b/core/src/main/python/mmlspark/cyber/anomaly/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/spark/__init__.py rename to core/src/main/python/mmlspark/cyber/anomaly/__init__.py diff --git a/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py 
b/core/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py rename to core/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py diff --git a/src/main/python/mmlspark/cyber/anomaly/complement_access.py b/core/src/main/python/mmlspark/cyber/anomaly/complement_access.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/complement_access.py rename to core/src/main/python/mmlspark/cyber/anomaly/complement_access.py diff --git a/src/main/python/mmlspark/cyber/dataset.py b/core/src/main/python/mmlspark/cyber/dataset.py similarity index 100% rename from src/main/python/mmlspark/cyber/dataset.py rename to core/src/main/python/mmlspark/cyber/dataset.py diff --git a/src/main/python/mmlspark/cyber/__init__.py b/core/src/main/python/mmlspark/cyber/feature/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/__init__.py rename to core/src/main/python/mmlspark/cyber/feature/__init__.py diff --git a/src/main/python/mmlspark/cyber/feature/indexers.py b/core/src/main/python/mmlspark/cyber/feature/indexers.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/indexers.py rename to core/src/main/python/mmlspark/cyber/feature/indexers.py diff --git a/src/main/python/mmlspark/cyber/feature/scalers.py b/core/src/main/python/mmlspark/cyber/feature/scalers.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/scalers.py rename to core/src/main/python/mmlspark/cyber/feature/scalers.py diff --git a/src/main/python/mmlspark/cyber/anomaly/__init__.py b/core/src/main/python/mmlspark/cyber/utils/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/__init__.py rename to core/src/main/python/mmlspark/cyber/utils/__init__.py diff --git a/src/main/python/mmlspark/cyber/utils/spark_utils.py b/core/src/main/python/mmlspark/cyber/utils/spark_utils.py 
similarity index 100% rename from src/main/python/mmlspark/cyber/utils/spark_utils.py rename to core/src/main/python/mmlspark/cyber/utils/spark_utils.py diff --git a/src/main/python/mmlspark/doc/conf.py b/core/src/main/python/mmlspark/doc/conf.py similarity index 100% rename from src/main/python/mmlspark/doc/conf.py rename to core/src/main/python/mmlspark/doc/conf.py diff --git a/src/main/python/mmlspark/doc/index.rst b/core/src/main/python/mmlspark/doc/index.rst similarity index 100% rename from src/main/python/mmlspark/doc/index.rst rename to core/src/main/python/mmlspark/doc/index.rst diff --git a/src/main/python/mmlspark/doc/scala.rst b/core/src/main/python/mmlspark/doc/scala.rst similarity index 100% rename from src/main/python/mmlspark/doc/scala.rst rename to core/src/main/python/mmlspark/doc/scala.rst diff --git a/src/main/python/mmlspark/downloader/ModelDownloader.py b/core/src/main/python/mmlspark/downloader/ModelDownloader.py similarity index 100% rename from src/main/python/mmlspark/downloader/ModelDownloader.py rename to core/src/main/python/mmlspark/downloader/ModelDownloader.py diff --git a/src/main/python/mmlspark/cyber/feature/__init__.py b/core/src/main/python/mmlspark/downloader/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/__init__.py rename to core/src/main/python/mmlspark/downloader/__init__.py diff --git a/src/main/python/mmlspark/io/IOImplicits.py b/core/src/main/python/mmlspark/io/IOImplicits.py similarity index 100% rename from src/main/python/mmlspark/io/IOImplicits.py rename to core/src/main/python/mmlspark/io/IOImplicits.py diff --git a/src/main/python/mmlspark/cyber/utils/__init__.py b/core/src/main/python/mmlspark/io/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/utils/__init__.py rename to core/src/main/python/mmlspark/io/__init__.py diff --git a/src/main/python/mmlspark/io/binary/BinaryFileReader.py b/core/src/main/python/mmlspark/io/binary/BinaryFileReader.py 
similarity index 100% rename from src/main/python/mmlspark/io/binary/BinaryFileReader.py rename to core/src/main/python/mmlspark/io/binary/BinaryFileReader.py diff --git a/src/main/python/mmlspark/downloader/__init__.py b/core/src/main/python/mmlspark/io/binary/__init__.py similarity index 100% rename from src/main/python/mmlspark/downloader/__init__.py rename to core/src/main/python/mmlspark/io/binary/__init__.py diff --git a/src/main/python/mmlspark/io/http/HTTPFunctions.py b/core/src/main/python/mmlspark/io/http/HTTPFunctions.py similarity index 100% rename from src/main/python/mmlspark/io/http/HTTPFunctions.py rename to core/src/main/python/mmlspark/io/http/HTTPFunctions.py diff --git a/src/main/python/mmlspark/io/http/JSONOutputParser.py b/core/src/main/python/mmlspark/io/http/JSONOutputParser.py similarity index 100% rename from src/main/python/mmlspark/io/http/JSONOutputParser.py rename to core/src/main/python/mmlspark/io/http/JSONOutputParser.py diff --git a/src/main/python/mmlspark/io/http/ServingFunctions.py b/core/src/main/python/mmlspark/io/http/ServingFunctions.py similarity index 100% rename from src/main/python/mmlspark/io/http/ServingFunctions.py rename to core/src/main/python/mmlspark/io/http/ServingFunctions.py diff --git a/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py b/core/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py similarity index 100% rename from src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py rename to core/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py diff --git a/src/main/python/mmlspark/image/__init__.py b/core/src/main/python/mmlspark/io/http/__init__.py similarity index 100% rename from src/main/python/mmlspark/image/__init__.py rename to core/src/main/python/mmlspark/io/http/__init__.py diff --git a/src/main/python/mmlspark/io/image/ImageUtils.py b/core/src/main/python/mmlspark/io/image/ImageUtils.py similarity index 100% rename from src/main/python/mmlspark/io/image/ImageUtils.py rename 
to core/src/main/python/mmlspark/io/image/ImageUtils.py diff --git a/src/main/python/mmlspark/io/__init__.py b/core/src/main/python/mmlspark/io/image/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/__init__.py rename to core/src/main/python/mmlspark/io/image/__init__.py diff --git a/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py b/core/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py similarity index 100% rename from src/main/python/mmlspark/io/powerbi/PowerBIWriter.py rename to core/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py diff --git a/src/main/python/mmlspark/io/binary/__init__.py b/core/src/main/python/mmlspark/io/powerbi/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/binary/__init__.py rename to core/src/main/python/mmlspark/io/powerbi/__init__.py diff --git a/src/main/python/mmlspark/nn/ConditionalBallTree.py b/core/src/main/python/mmlspark/nn/ConditionalBallTree.py similarity index 100% rename from src/main/python/mmlspark/nn/ConditionalBallTree.py rename to core/src/main/python/mmlspark/nn/ConditionalBallTree.py diff --git a/src/main/python/mmlspark/io/http/__init__.py b/core/src/main/python/mmlspark/nn/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/http/__init__.py rename to core/src/main/python/mmlspark/nn/__init__.py diff --git a/src/main/python/mmlspark/io/image/__init__.py b/core/src/main/python/mmlspark/plot/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/image/__init__.py rename to core/src/main/python/mmlspark/plot/__init__.py diff --git a/src/main/python/mmlspark/plot/plot.py b/core/src/main/python/mmlspark/plot/plot.py similarity index 100% rename from src/main/python/mmlspark/plot/plot.py rename to core/src/main/python/mmlspark/plot/plot.py diff --git a/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py b/core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py similarity index 100% 
rename from src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py rename to core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py diff --git a/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py b/core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py similarity index 100% rename from src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py rename to core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py diff --git a/src/main/python/mmlspark/recommendation/SARModel.py b/core/src/main/python/mmlspark/recommendation/SARModel.py similarity index 100% rename from src/main/python/mmlspark/recommendation/SARModel.py rename to core/src/main/python/mmlspark/recommendation/SARModel.py diff --git a/src/main/python/mmlspark/recommendation/__init__.py b/core/src/main/python/mmlspark/recommendation/__init__.py similarity index 100% rename from src/main/python/mmlspark/recommendation/__init__.py rename to core/src/main/python/mmlspark/recommendation/__init__.py diff --git a/src/main/python/mmlspark/stages/UDFTransformer.py b/core/src/main/python/mmlspark/stages/UDFTransformer.py similarity index 100% rename from src/main/python/mmlspark/stages/UDFTransformer.py rename to core/src/main/python/mmlspark/stages/UDFTransformer.py diff --git a/src/main/python/mmlspark/io/powerbi/__init__.py b/core/src/main/python/mmlspark/stages/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/powerbi/__init__.py rename to core/src/main/python/mmlspark/stages/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt b/core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt rename to core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala diff 
--git a/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt b/core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt
rename to core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt
diff --git a/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala
new file mode 100644
index 00000000000..bd88735e5f0
--- /dev/null
+++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala
@@ -0,0 +1,225 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.codegen
+
+import java.io.File
+
+import com.microsoft.ml.spark.codegen.CodegenConfigProtocol._
+import com.microsoft.ml.spark.core.env.FileUtilities._
+import org.apache.commons.io.FileUtils
+import org.apache.commons.io.FilenameUtils._
+import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices
+import spray.json._
+
+/** File-system helpers shared by the code generators. */
+object CodeGenUtils {
+  /** Deletes `dir` with `FileUtils.forceDelete` when it exists; does nothing otherwise. */
+  def clean(dir: File): Unit = if (dir.exists()) FileUtils.forceDelete(dir)
+
+  /** Wraps `f` as `new File(f, File.separator)`. */
+  def toDir(f: File): File = new File(f, File.separator)
+}
+
+
+/** Generates the Python and R wrapper sources described by a [[CodegenConfig]]. */
+object CodeGen {
+
+  import CodeGenUtils._
+
+  /** Writes one Python file per `PythonWrappable` returned by `instantiateServices` for `conf.jarName`. */
+  def generatePythonClasses(conf: CodegenConfig): Unit = {
+    val instantiatedClasses = instantiateServices[PythonWrappable](conf.jarName)
+    instantiatedClasses.foreach { w =>
+      println(w.getClass.getName)
+      w.makePyFile(conf)
+    }
+  }
+
+  /** Writes one R file per `RWrappable` returned by `instantiateServices` for `conf.jarName`. */
+  def generateRClasses(conf: CodegenConfig): Unit = {
+    val instantiatedClasses = instantiateServices[RWrappable](conf.jarName)
+    instantiatedClasses.foreach { w =>
+      println(w.getClass.getName)
+      w.makeRFile(conf)
+    }
+  }
+
+  /** Writes `__init__.py` files for the sub-packages under `<pySrcDir>/mmlspark`.
+    *
+    * Each file imports every `.py` module of its folder whose name starts with neither `_` nor
+    * `test`. For the top-level folder (`packageFolder == ""`) nothing is written and an existing
+    * `__init__.py` is deleted. Sub-directories are processed recursively.
+    */
+  private def makeInitFiles(conf: CodegenConfig, packageFolder: String = ""): Unit = {
+    val dir = new File(new File(conf.pySrcDir, "mmlspark"), packageFolder)
+    // File.listFiles returns null when `dir` is missing or unreadable; treat that as empty
+    // instead of failing with a NullPointerException.
+    val entries = Option(dir.listFiles).getOrElse(Array.empty[File])
+    val packageString = if (packageFolder != "") packageFolder.replace("/", ".") else ""
+    val importStrings =
+      entries.filter(_.isFile).sorted
+        .map(_.getName)
+        .filter(name => name.endsWith(".py") && !name.startsWith("_") && !name.startsWith("test"))
+        .map(name => s"from mmlspark$packageString.${getBaseName(name)} import *\n").mkString("")
+    val initFile = new File(dir, "__init__.py")
+    if (packageFolder != "") {
+      writeFile(initFile, conf.packageHelp(importStrings))
+    } else if (initFile.exists()) {
+      initFile.delete()
+    }
+    entries.filter(_.isDirectory).foreach(f =>
+      makeInitFiles(conf, packageFolder + "/" + f.getName)
+    )
+  }
+
+  /** Writes the R package metadata: `DESCRIPTION`, `package_register.R` and `mmlspark.Rproj`. */
+  //noinspection ScalaStyle
+  def generateRPackageData(conf: CodegenConfig): Unit = {
+    // description file; need to encode version as decimal
+    val today = new java.text.SimpleDateFormat("yyyy-MM-dd")
+      .format(new java.util.Date())
+
+    conf.rSrcDir.mkdirs()
+    writeFile(new File(conf.rSrcDir.getParentFile, "DESCRIPTION"),
+      s"""|Package: ${conf.name.replace("-",".")}
+          |Title: Access to MMLSpark via R
+          |Description: Provides an interface to MMLSpark.
+          |Version: ${conf.rVersion}
+          |Date: $today
+          |Author: Microsoft Corporation
+          |Maintainer: MMLSpark Team
+          |URL: https://github.com/Azure/mmlspark
+          |BugReports: https://github.com/Azure/mmlspark/issues
+          |Depends:
+          | R (>= 2.12.0)
+          |Imports:
+          | sparklyr
+          |License: MIT
+          |Suggests:
+          | testthat (>= 3.0.0)
+          |Config/testthat/edition: 3
+          |""".stripMargin)
+
+    writeFile(new File(conf.rSrcDir, "package_register.R"),
+      s"""|#' @import sparklyr
+          |spark_dependencies <- function(spark_version, scala_version, ...) {
+          | spark_dependency(
+          | jars = c(),
+          | packages = c(
+          | "com.microsoft.ml.spark:${conf.name}:${conf.version}"
+          | ),
+          | repositories = c("https://mmlspark.azureedge.net/maven")
+          | )
+          |}
+          |
+          |#' @import sparklyr
+          |.onLoad <- function(libname, pkgname) {
+          | sparklyr::register_extension(pkgname)
+          |}
+          |""".stripMargin)
+
+    writeFile(new File(conf.rSrcDir.getParentFile, "mmlspark.Rproj"),
+      """
+        |Version: 1.0
+        |
+        |RestoreWorkspace: Default
+        |SaveWorkspace: Default
+        |AlwaysSaveHistory: Default
+        |
+        |EnableCodeIndexing: Yes
+        |UseSpacesForTab: Yes
+        |NumSpacesForTab: 4
+        |Encoding: UTF-8
+        |
+        |RnwWeave: Sweave
+        |LaTeX: pdfLaTeX
+        |
+        |BuildType: Package
+        |PackageUseDevtools: Yes
+        |PackageInstallArgs: --no-multiarch --with-keep.source
+        |
+        |""".stripMargin)
+
+  }
+
+  /** Writes `setup.py` into `pySrcDir`, creating the directory (and missing parents) first. */
+  //noinspection ScalaStyle
+  def generatePyPackageData(conf: CodegenConfig): Unit = {
+    // mkdirs (not mkdir) so that missing parent directories are created too; it is a no-op
+    // when the directory already exists.
+    conf.pySrcDir.mkdirs()
+    writeFile(join(conf.pySrcDir, "setup.py"),
+      s"""
+         |# Copyright (C) Microsoft Corporation. All rights reserved.
+         |# Licensed under the MIT License. See LICENSE in project root for information.
+         |
+         |import os
+         |from setuptools import setup, find_namespace_packages
+         |import codecs
+         |import os.path
+         |
+         |setup(
+         | name="${conf.name}",
+         | version="${conf.pythonizedVersion}",
+         | description="Microsoft ML for Spark",
+         | long_description="Microsoft ML for Apache Spark contains Microsoft's open source "
+         | + "contributions to the Apache Spark ecosystem",
+         | license="MIT",
+         | packages=find_namespace_packages(include=['mmlspark.*']),
+         | url="https://github.com/Azure/mmlspark",
+         | author="Microsoft",
+         | author_email="mmlspark-support@microsoft.com",
+         | classifiers=[
+         | "Development Status :: 4 - Beta",
+         | "Intended Audience :: Developers",
+         | "Intended Audience :: Data Scientists",
+         | "Topic :: Software Development :: Datascience Tools",
+         | "License :: OSI Approved :: MIT License",
+         | "Programming Language :: Python :: 2",
+         | "Programming Language :: Python :: 3",
+         | ],
+         | zip_safe=True,
+         | package_data={"mmlspark": ["../LICENSE.txt", "../README.txt"]},
+         |)
+         |
+         |""".stripMargin)
+  }
+
+
+  /** Rebuilds the R package under `rSrcRoot`, then copies the R source/test override dirs if present. */
+  def rGen(conf: CodegenConfig): Unit = {
+    println(s"Generating R for ${conf.jarName}")
+    clean(conf.rSrcRoot)
+    generateRPackageData(conf)
+    generateRClasses(conf)
+    if (conf.rSrcOverrideDir.exists())
+      FileUtils.copyDirectoryToDirectory(toDir(conf.rSrcOverrideDir), toDir(conf.rSrcDir))
+    if (conf.rTestOverrideDir.exists())
+      FileUtils.copyDirectoryToDirectory(toDir(conf.rTestOverrideDir), toDir(conf.rTestDir))
+  }
+
+  /** Rebuilds `pySrcDir`, copies the Python override dir if present, then writes the `__init__.py` files. */
+  def pyGen(conf: CodegenConfig): Unit = {
+    println(s"Generating python for ${conf.jarName}")
+    clean(conf.pySrcDir)
+    generatePyPackageData(conf)
+    generatePythonClasses(conf)
+    if (conf.pySrcOverrideDir.exists())
+      FileUtils.copyDirectoryToDirectory(toDir(conf.pySrcOverrideDir), toDir(conf.pySrcDir))
+    makeInitFiles(conf)
+  }
+
+  /** Parses `args.head` as a JSON-encoded [[CodegenConfig]], wipes `packageDir`, then runs
+    * [[rGen]] followed by [[pyGen]].
+    */
+  def main(args: Array[String]): Unit = {
+    require(args.nonEmpty, "Expected a CodegenConfig serialized as JSON as the first argument")
+    val conf = args.head.parseJson.convertTo[CodegenConfig]
+    clean(conf.packageDir)
+    rGen(conf)
+    pyGen(conf)
+  }
+
+}
+
diff --git
a/core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala
new file mode 100644
index 00000000000..049eb1bb8f9
--- /dev/null
+++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala
@@ -0,0 +1,80 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.codegen
+
+import java.io.File
+
+import spray.json.{DefaultJsonProtocol, RootJsonFormat}
+
+/** Settings for one code-generation run, plus the directories and text templates derived from them. */
+case class CodegenConfig(name: String,
+                         jarName: Option[String],
+                         topDir: String,
+                         targetDir: String,
+                         version: String,
+                         pythonizedVersion: String,
+                         rVersion: String,
+                         packageName: String) {
+  def generatedDir: File = new File(targetDir, "generated")
+  def packageDir: File = new File(generatedDir, "package")
+  def srcDir: File = new File(generatedDir, "src")
+  def testDir: File = new File(generatedDir, "test")
+  def docDir: File = new File(generatedDir, "doc")
+  def testDataDir: File = new File(generatedDir, "test-data")
+
+  // Python codegen directories; the *OverrideDir paths are resolved against topDir
+  def pySrcDir: File = new File(srcDir, "python")
+  def pyPackageDir: File = new File(packageDir, "python")
+  def pyTestDir: File = new File(testDir, "python")
+  def pySrcOverrideDir: File = new File(topDir, "src/main/python")
+  def pyTestOverrideDir: File = new File(topDir, "src/test/python")
+
+  // R codegen directories; the *OverrideDir paths are resolved against topDir
+  def rSrcRoot: File = new File(srcDir, "R")
+  def rSrcDir: File = new File(rSrcRoot, "mmlspark/R")
+  def rPackageDir: File = new File(packageDir, "R")
+  def rTestDir: File = new File(rSrcRoot, "mmlspark/tests")
+  def rTestOverrideDir: File = new File(topDir, "src/test/R")
+  def rSrcOverrideDir: File = new File(topDir, "src/main/R")
+
+  //val rPackageFile = new File(rPackageDir, s"mmlspark-$mmlVer.zip")
+  def internalPrefix: String = "_"
+  def scopeDepth: String = " " * 4
+  // '#'-commented copyright and license lines; packageHelp emits them first
+  def copyrightLines: String =
+    s"""|# Copyright (C) Microsoft Corporation. All rights reserved.
+        |# Licensed under the MIT License. See LICENSE in project root for information.
+        |""".stripMargin
+
+  /** Text of a package `__init__.py`: header, docstring, version variables, then `importString`. */
+  def packageHelp(importString: String): String = {
+    s"""|$copyrightLines
+        |
+        |"\""
+        |MMLSpark is an ecosystem of tools aimed towards expanding the distributed computing framework
+        |Apache Spark in several new directions. MMLSpark adds many deep learning and data science tools to the Spark
+        |ecosystem, including seamless integration of Spark Machine Learning pipelines with
+        |Microsoft Cognitive Toolkit (CNTK), LightGBM and OpenCV. These tools enable powerful and
+        |highly-scalable predictive and analytical models for a variety of datasources.
+        |
+        |MMLSpark also brings new networking capabilities to the Spark Ecosystem. With the HTTP on Spark project,
+        |users can embed any web service into their SparkML models. In this vein, MMLSpark provides easy to use SparkML
+        |transformers for a wide variety of Microsoft Cognitive Services. For production grade deployment,
+        |the Spark Serving project enables high throughput, sub-millisecond latency web services,
+        |backed by your Spark cluster.
+        |
+        |MMLSpark requires Scala 2.11, Spark 2.4+, and Python 3.5+.
+        |"\""
+        |
+        |__version__ = "$pythonizedVersion"
+        |__spark_package_version__ = "$version"
+        |
+        |$importString
+        |""".stripMargin
+  }
+}
+/** spray-json format for [[CodegenConfig]], built from its eight fields with `jsonFormat8`. */
+object CodegenConfigProtocol extends DefaultJsonProtocol {
+  implicit val CCFormat: RootJsonFormat[CodegenConfig] = jsonFormat8(CodegenConfig.apply)
+}
diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala
rename to core/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala
similarity index 98%
rename from src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala
rename to core/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala
index a8d619264b5..3bae3fa8fc9 100644
--- a/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala
+++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala
@@ -379,11 +379,11 @@ trait PythonWrappable extends BaseWrappable {
     """.stripMargin
   }
 
-  def makePyFile(): Unit = {
+  def makePyFile(conf: CodegenConfig): Unit = {
     val importPath = this.getClass.getName.split(".".toCharArray).dropRight(1)
     val srcFolders = importPath.mkString(".")
       .replaceAllLiterally("com.microsoft.ml.spark", "mmlspark").split(".".toCharArray)
-    val srcDir = FileUtilities.join((Seq(Config.PySrcDir.toString) ++ srcFolders.toSeq): _*)
+    val srcDir = FileUtilities.join((Seq(conf.pySrcDir.toString) ++ srcFolders.toSeq): _*)
     srcDir.mkdirs()
     Files.write(
       FileUtilities.join(srcDir, pyClassName + ".py").toPath,
@@ -504,10 +504,10 @@ trait RWrappable extends BaseWrappable {
   }
 
 
-  def makeRFile(): Unit = {
-    Config.RSrcDir.mkdirs()
+  def makeRFile(conf: CodegenConfig): Unit = {
+    conf.rSrcDir.mkdirs()
     Files.write(
-      FileUtilities.join(Config.RSrcDir,
rFuncName + ".R").toPath, + FileUtilities.join(conf.rSrcDir, rFuncName + ".R").toPath, rClass().getBytes(StandardCharsets.UTF_8)) } diff --git a/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala b/core/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala b/core/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java b/core/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java rename to core/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala b/core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala similarity index 
100% rename from src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala b/core/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala b/core/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala
rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala
similarity index 99%
rename from src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala
rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala
index 297dba1de68..db8e39cd033 100644
--- a/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala
+++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala
@@ -6,7 +6,7 @@ package com.microsoft.ml.spark.core.utils
 import java.net.InetAddress
 
 import org.apache.http.conn.util.InetAddressUtils
-import org.apache.spark.lightgbm.BlockManagerUtils
+import org.apache.spark.injections.BlockManagerUtils
 import org.apache.spark.sql.{Dataset, SparkSession}
 import org.slf4j.Logger
 
diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala
rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala
diff --git a/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala
new file mode 100644
index 00000000000..7bbe1b1d0c8
--- /dev/null
+++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala
@@ -0,0 +1,60 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.core.utils
+
+import scala.concurrent.duration.Duration
+import scala.concurrent.{Await, ExecutionContext, Future}
+import scala.util.control.NonFatal
+
+/** Helpers that re-run a failing computation a bounded number of times. */
+object FaultToleranceUtils {
+
+  /** Runs `f` on the global execution context, waiting at most `timeout` for each attempt.
+    *
+    * A non-fatal failure (including the `TimeoutException` raised by `Await.result`) triggers
+    * another attempt while `times >= 1`; after that the last failure is rethrown. Fatal
+    * throwables such as `InterruptedException` are never retried.
+    *
+    * NOTE: an attempt that timed out is not cancelled here, so its body may still be running
+    * when the next attempt starts.
+    *
+    * @param times   number of retries allowed after the first attempt
+    * @param timeout maximum time to wait for a single attempt
+    * @param f       computation to run; re-evaluated on every attempt
+    * @return the result of the first attempt that completes within `timeout`
+    */
+  def retryWithTimeout[T](times: Int, timeout: Duration)(f: => T): T = {
+    try {
+      Await.result(Future(f)(ExecutionContext.global), timeout)
+    } catch {
+      case NonFatal(e) if times >= 1 =>
+        println(s"Received exception on call, retrying: $e")
+        retryWithTimeout(times - 1, timeout)(f)
+    }
+  }
+
+  /** Default pauses, in milliseconds, slept before each successive retry. */
+  val Backoffs: Seq[Int] = Seq(0, 100, 200, 500)
+
+  /** Runs `f` on the calling thread, retrying after non-fatal failures.
+    *
+    * Despite the name, this overload applies no timeout: before each retry it sleeps for the
+    * next value of `times` and gives up (rethrowing the last failure) once `times` is empty.
+    *
+    * @param times pauses in milliseconds, one per allowed retry
+    * @param f     computation to run; re-evaluated on every attempt
+    * @return the result of the first successful attempt
+    */
+  def retryWithTimeout[T](times: Seq[Int] = Backoffs)(f: => T): T = {
+    try {
+      f
+    } catch {
+      case NonFatal(e) if times.nonEmpty =>
+        println(s"Received exception on call, retrying: $e")
+        Thread.sleep(times.head)
+        retryWithTimeout(times.tail)(f)
+    }
+  }
+
+}
diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala
similarity index 72%
rename from src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala
rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala
index dba98c4a595..478631f6209 100644
--- a/src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala
+++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala
@@ -7,9 +7,8 @@ import java.lang.reflect.Modifier
 
 import com.microsoft.ml.spark.codegen.Wrappable
 import org.sparkproject.guava.reflect.ClassPath
-
 import scala.collection.JavaConverters._
-import scala.reflect.{ClassTag, _}
+import scala.reflect.{ClassTag, classTag}
 
 /** Contains logic for loading classes.
*/
 object JarLoadingUtils {
@@ -41,22 +40,25 @@ object JarLoadingUtils {
     AllClasses.filter(classOf[Wrappable].isAssignableFrom(_))
   }
 
-  def instantiateServices[T: ClassTag](instantiate: Class[_] => Any): List[T] = {
+  def instantiateServices[T: ClassTag](instantiate: Class[_] => Any, jarName: Option[String]): List[T] = {
     AllClasses
       .filter(classTag[T].runtimeClass.isAssignableFrom(_))
+      .filter(c => jarName.forall(c.getResource(c.getSimpleName + ".class").toString.contains(_)))
       .filter(clazz => !Modifier.isAbstract(clazz.getModifiers))
       .map(instantiate(_)).asInstanceOf[List[T]]
   }
 
-  def instantiateServices[T: ClassTag]: List[T] = instantiateServices[T] {
+  def instantiateServices[T: ClassTag](jarName: Option[String] = None): List[T] = instantiateServices[T]({
     clazz: Class[_] => clazz.getConstructor().newInstance()
-  }
+  }, jarName)
 
-  def instantiateObjects[T: ClassTag]: List[T] = instantiateServices[T] { clazz: Class[_] => {
-    val cons = clazz.getDeclaredConstructors()(0)
-    cons.setAccessible(true)
-    cons.newInstance()
-  }}
+  def instantiateObjects[T: ClassTag](jarName: Option[String] = None): List[T] = instantiateServices[T](
+    { clazz: Class[_] => {
+      val cons = clazz.getDeclaredConstructors()(0)
+      cons.setAccessible(true)
+      cons.newInstance()
+    }
+    },
+    jarName)
 
 }
-
diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala
rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala
diff --git a/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala
new file mode 100644
index 00000000000..80c4560fe89
--- /dev/null
+++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala
@@ -0,0 +1,16 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.core.utils
+
+import java.util.Locale
+
+/** Operating-system checks derived from JVM system properties. */
+object OsUtils {
+  /** True when the `os.name` system property starts with "win" (case-insensitive).
+    *
+    * A prefix match is used because a substring match also accepts "Darwin"; `Locale.ROOT`
+    * keeps the lower-casing independent of the default locale.
+    */
+  val IsWindows: Boolean = System.getProperty("os.name").toLowerCase(Locale.ROOT).startsWith("win")
+}
diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala
rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/SlicerFunctions.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/SlicerFunctions.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/core/utils/SlicerFunctions.scala
rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/SlicerFunctions.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala
rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/BreezeUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/BreezeUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/explainers/BreezeUtils.scala
rename to core/src/main/scala/com/microsoft/ml/spark/explainers/BreezeUtils.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/FeatureStats.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/FeatureStats.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/explainers/FeatureStats.scala
rename to core/src/main/scala/com/microsoft/ml/spark/explainers/FeatureStats.scala
diff --git
a/src/main/scala/com/microsoft/ml/spark/explainers/ImageExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ImageExplainer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/ImageExplainer.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/ImageExplainer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/ImageLIME.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ImageLIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/ImageLIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/ImageLIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/ImageSHAP.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ImageSHAP.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/ImageSHAP.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/ImageSHAP.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/KernelSHAPBase.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/KernelSHAPBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/KernelSHAPBase.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/KernelSHAPBase.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/KernelSHAPSampler.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/KernelSHAPSampler.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/KernelSHAPSampler.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/KernelSHAPSampler.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/LIMEBase.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/LIMEBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/LIMEBase.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/explainers/LIMEBase.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/LIMESampler.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/LIMESampler.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/LIMESampler.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/LIMESampler.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/LassoRegression.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/LassoRegression.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/LassoRegression.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/LassoRegression.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/LeastSquaresRegression.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/LeastSquaresRegression.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/LeastSquaresRegression.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/LeastSquaresRegression.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/LocalExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/LocalExplainer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/LocalExplainer.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/LocalExplainer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/RegressionBase.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/RegressionBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/RegressionBase.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/RegressionBase.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/RowUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/RowUtils.scala similarity 
index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/RowUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/RowUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/Sampler.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/Sampler.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/Sampler.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/Sampler.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/SharedParams.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/SharedParams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/SharedParams.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/SharedParams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/TabularLIME.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/TabularLIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/TabularLIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/TabularLIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/TabularSHAP.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/TabularSHAP.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/TabularSHAP.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/TabularSHAP.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/TextExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/TextExplainer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/TextExplainer.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/TextExplainer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/TextLIME.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/TextLIME.scala 
similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/TextLIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/TextLIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/TextSHAP.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/TextSHAP.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/TextSHAP.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/TextSHAP.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/VectorLIME.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/VectorLIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/VectorLIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/VectorLIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/explainers/VectorSHAP.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/VectorSHAP.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/explainers/VectorSHAP.scala rename to core/src/main/scala/com/microsoft/ml/spark/explainers/VectorSHAP.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala 
b/core/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt 
b/core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt diff --git a/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala b/core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala rename to core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt b/core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt rename to core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt diff --git a/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala b/core/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala b/core/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala b/core/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala rename to core/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/LIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala similarity index 99% rename from 
src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala index db01e357fef..cddb9b3b6df 100644 --- a/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala @@ -18,10 +18,8 @@ import org.apache.spark.ml.image.ImageSchema import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{BinaryType, DataType} -import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, ListBuffer} case class SuperpixelData(clusters: Seq[Seq[(Int, Int)]]) diff --git a/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala b/core/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala rename to core/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/nn/KNN.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala index a4c3973a794..2acde7942bf 100644 --- a/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala @@ -84,7 +84,7 @@ class KNNModel(val uid: String) extends Model[KNNModel] private var broadcastedModelOption: Option[Broadcast[BallTree[_]]] = None val ballTree = new BallTreeParam(this, "ballTree", - "the ballTree model used for perfoming queries", { _ => true }) + "the ballTree model used for performing queries", { _ => true }) def getBallTree: BallTree[_] = $(ballTree) diff --git a/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala similarity index 
100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt b/core/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Explode.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt rename to 
core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala index 0e05283c7ba..8a3cc7a0fa0 100644 --- a/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala @@ -19,6 +19,35 @@ import scala.concurrent.blocking object PartitionConsolidator extends DefaultParamsReadable[PartitionConsolidator] +class PartitionConsolidator(val uid: String) + extends Transformer with HTTPParams with HasInputCol + with HasOutputCol + with ComplexParamsWritable with BasicLogging { + logClass() + + def this() = this(Identifiable.randomUID("PartitionConsolidator")) + + val consolidatorHolder = SharedSingleton { + new Consolidator[Row]() + } + + override def transform(dataset: Dataset[_]): DataFrame = { + logTransform[DataFrame]({ + dataset.toDF().mapPartitions { it => + if (it.hasNext) { + consolidatorHolder.get.registerAndReceive(it).flatten + } else { + Iterator() + } + }(RowEncoder(dataset.schema)) + }) + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = schema +} + class Consolidator[T] { val buffer = new LinkedBlockingQueue[T]() @@ -108,36 +137,8 @@ class Consolidator[T] { } -class PartitionConsolidator(val uid: String) - extends Transformer with HTTPParams with HasInputCol - with HasOutputCol - with ComplexParamsWritable with BasicLogging { - logClass() - - def this() = this(Identifiable.randomUID("PartitionConsolidator")) - - val consolidatorHolder = SharedSingleton { - new Consolidator[Row]() - } - - override def 
transform(dataset: Dataset[_]): DataFrame = { - logTransform[DataFrame]({ - dataset.toDF().mapPartitions { it => - if (it.hasNext) { - consolidatorHolder.get.registerAndReceive(it).flatten - } else { - Iterator() - } - }(RowEncoder(dataset.schema)) - }) - } - - override def copy(extra: ParamMap): Transformer = defaultCopy(extra) - - override def transformSchema(schema: StructType): StructType = schema -} - trait LocalAggregator[T] { def prep(iter: Iterator[Row]): T + def merge(ts: Seq[T]): T } diff --git a/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Timer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala index be12d2dcee8..889d1d85225 100644 --- a/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala @@ -1,79 +1,79 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.stages - -import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} -import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.sql.functions.udf - -import java.text.Normalizer -import com.microsoft.ml.spark.codegen.Wrappable -import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol} -import com.microsoft.ml.spark.logging.BasicLogging -import org.apache.spark.sql.types.{StringType, StructField, StructType} - -object UnicodeNormalize extends ComplexParamsReadable[UnicodeNormalize] - -/** UnicodeNormalize takes a dataframe and normalizes the unicode representation. 
- */ -class UnicodeNormalize(val uid: String) extends Transformer - with HasInputCol with HasOutputCol with Wrappable with ComplexParamsWritable with BasicLogging { - logClass() - - def this() = this(Identifiable.randomUID("UnicodeNormalize")) - - val form = new Param[String](this, "form", "Unicode normalization form: NFC, NFD, NFKC, NFKD") - - /** @group getParam */ - def getForm: String = get(form).getOrElse("NFKD") - - /** @group setParam */ - def setForm(value: String): this.type = { - // check input value - Normalizer.Form.valueOf(getForm) - - set("form", value) - } - - val lower = new BooleanParam(this, "lower", "Lowercase text") - - /** @group getParam */ - def getLower: Boolean = get(lower).getOrElse(true) - - /** @group setParam */ - def setLower(value: Boolean): this.type = set("lower", value) - - /** @param dataset - The input dataset, to be transformed - * @return The DataFrame that results from column selection - */ - override def transform(dataset: Dataset[_]): DataFrame = { - logTransform[DataFrame]({ - val inputIndex = dataset.columns.indexOf(getInputCol) - - require(inputIndex != -1, s"Input column $getInputCol does not exist") - - val normalizeFunc = (value: String) => - if (value == null) null - else Normalizer.normalize(value, Normalizer.Form.valueOf(getForm)) - - val f = if (getLower) - (value: String) => Option(value).map(s => normalizeFunc(s.toLowerCase)).orNull - else - normalizeFunc - - val textMapper = udf(f) - - dataset.withColumn(getOutputCol, textMapper(dataset(getInputCol)).as(getOutputCol)) - }) - } - - def transformSchema(schema: StructType): StructType = { - schema.add(StructField(getOutputCol, StringType)) - } - - def copy(extra: ParamMap): UnicodeNormalize = defaultCopy(extra) - -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark.stages + +import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.udf + +import java.text.Normalizer +import com.microsoft.ml.spark.codegen.Wrappable +import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol} +import com.microsoft.ml.spark.logging.BasicLogging +import org.apache.spark.sql.types.{StringType, StructField, StructType} + +object UnicodeNormalize extends ComplexParamsReadable[UnicodeNormalize] + +/** UnicodeNormalize takes a dataframe and normalizes the unicode representation. + */ +class UnicodeNormalize(val uid: String) extends Transformer + with HasInputCol with HasOutputCol with Wrappable with ComplexParamsWritable with BasicLogging { + logClass() + + def this() = this(Identifiable.randomUID("UnicodeNormalize")) + + val form = new Param[String](this, "form", "Unicode normalization form: NFC, NFD, NFKC, NFKD") + + /** @group getParam */ + def getForm: String = get(form).getOrElse("NFKD") + + /** @group setParam */ + def setForm(value: String): this.type = { + // check input value + Normalizer.Form.valueOf(getForm) + + set("form", value) + } + + val lower = new BooleanParam(this, "lower", "Lowercase text") + + /** @group getParam */ + def getLower: Boolean = get(lower).getOrElse(true) + + /** @group setParam */ + def setLower(value: Boolean): this.type = set("lower", value) + + /** @param dataset - The input dataset, to be transformed + * @return The DataFrame that results from column selection + */ + override def transform(dataset: Dataset[_]): DataFrame = { + logTransform[DataFrame]({ + val inputIndex = dataset.columns.indexOf(getInputCol) + + require(inputIndex != -1, s"Input column $getInputCol does not exist") + + val normalizeFunc = (value: String) => + if (value == 
null) null + else Normalizer.normalize(value, Normalizer.Form.valueOf(getForm)) + + val f = if (getLower) + (value: String) => Option(value).map(s => normalizeFunc(s.toLowerCase)).orNull + else + normalizeFunc + + val textMapper = udf(f) + + dataset.withColumn(getOutputCol, textMapper(dataset(getInputCol)).as(getOutputCol)) + }) + } + + def transformSchema(schema: StructType): StructType = { + schema.add(StructField(getOutputCol, StringType)) + } + + def copy(extra: ParamMap): UnicodeNormalize = defaultCopy(extra) + +} diff --git a/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/udfs.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala b/core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala b/core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala b/core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt b/core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt 
similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala b/core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt b/core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala b/core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt b/core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala b/core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt b/core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt diff --git a/src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala similarity index 92% rename from src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala rename to core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala index ee0ba74dd41..6d0564abb4b 100644 --- a/src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala +++ b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala @@ -1,13 +1,14 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. -package org.apache.spark.lightgbm +package org.apache.spark.injections import org.apache.spark.sql.Dataset import org.apache.spark.storage.BlockManager object BlockManagerUtils { /** Returns the block manager from the dataframe's spark context. + * * @param data The dataframe to get the block manager from. * @return The block manager. 
*/ diff --git a/src/main/scala/org/apache/spark/injections/RegressionUtils.scala b/core/src/main/scala/org/apache/spark/injections/RegressionUtils.scala similarity index 100% rename from src/main/scala/org/apache/spark/injections/RegressionUtils.scala rename to core/src/main/scala/org/apache/spark/injections/RegressionUtils.scala diff --git a/src/main/scala/org/apache/spark/injections/SConf.scala b/core/src/main/scala/org/apache/spark/injections/SConf.scala similarity index 100% rename from src/main/scala/org/apache/spark/injections/SConf.scala rename to core/src/main/scala/org/apache/spark/injections/SConf.scala diff --git a/src/main/scala/org/apache/spark/injections/UDFUtils.scala b/core/src/main/scala/org/apache/spark/injections/UDFUtils.scala similarity index 100% rename from src/main/scala/org/apache/spark/injections/UDFUtils.scala rename to core/src/main/scala/org/apache/spark/injections/UDFUtils.scala diff --git a/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala b/core/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala rename to core/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala diff --git a/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala b/core/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala rename to core/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala diff --git a/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala b/core/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/NamespaceInjections.scala rename to core/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala diff --git a/src/main/scala/org/apache/spark/ml/Ranker.scala 
b/core/src/main/scala/org/apache/spark/ml/Ranker.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/Ranker.scala rename to core/src/main/scala/org/apache/spark/ml/Ranker.scala diff --git a/src/main/scala/org/apache/spark/ml/RegressorUtils.scala b/core/src/main/scala/org/apache/spark/ml/RegressorUtils.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/RegressorUtils.scala rename to core/src/main/scala/org/apache/spark/ml/RegressorUtils.scala diff --git a/src/main/scala/org/apache/spark/ml/Serializer.scala b/core/src/main/scala/org/apache/spark/ml/Serializer.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/Serializer.scala rename to core/src/main/scala/org/apache/spark/ml/Serializer.scala diff --git a/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala b/core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala rename to core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala diff --git a/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt b/core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt similarity index 100% rename from src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt rename to core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt diff --git a/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala similarity index 100% rename from 
src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala b/core/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala b/core/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala b/core/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala rename to 
core/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/MapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/MapParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/MapParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/MapParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala b/core/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala diff --git 
a/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/TransformerParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/UDFParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UDFParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/UDFParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/UDFParam.scala diff --git 
a/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala b/core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala rename to core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala diff --git a/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala b/core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala rename to core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala similarity index 100% rename from 
src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala similarity index 99% rename from src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala index ef61d8330df..05b373a5f2f 100644 --- a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala +++ b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala @@ -152,7 +152,7 @@ private[streaming] object DriverServiceUtils { override def handle(request: HttpExchange): Unit = { try { val info = Serialization.read[ServiceInfo]( - IOUtils.toString(request.getRequestBody)) + IOUtils.toString(request.getRequestBody, "UTF-8")) HTTPServerUtils.respond(request, HTTPResponseData( Array(), None, StatusLineData(null, 200, "Success"), diff --git 
a/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala b/core/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala rename to core/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala diff --git a/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala b/core/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala rename to core/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala diff --git a/src/test/R/testthat.R b/core/src/test/R/testthat.R similarity index 100% rename from src/test/R/testthat.R rename to core/src/test/R/testthat.R diff --git a/src/test/R/testthat/setup-spark.R b/core/src/test/R/testthat/setup-spark.R similarity index 100% rename from src/test/R/testthat/setup-spark.R rename to core/src/test/R/testthat/setup-spark.R diff --git a/src/test/R/testthat/test-basic.R b/core/src/test/R/testthat/test-basic.R similarity index 100% rename from src/test/R/testthat/test-basic.R rename to core/src/test/R/testthat/test-basic.R diff --git a/src/test/python/LICENSE.txt b/core/src/test/python/LICENSE.txt similarity index 100% rename from src/test/python/LICENSE.txt rename to core/src/test/python/LICENSE.txt diff --git a/src/test/python/MANIFEST.in b/core/src/test/python/MANIFEST.in similarity index 100% rename from src/test/python/MANIFEST.in rename to core/src/test/python/MANIFEST.in diff --git a/src/main/python/mmlspark/lightgbm/__init__.py b/core/src/test/python/__init__.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/__init__.py rename to core/src/test/python/__init__.py diff --git a/src/main/python/mmlspark/nn/__init__.py b/core/src/test/python/mmlsparktest/__init__.py 
similarity index 100% rename from src/main/python/mmlspark/nn/__init__.py rename to core/src/test/python/mmlsparktest/__init__.py diff --git a/src/main/python/mmlspark/opencv/__init__.py b/core/src/test/python/mmlsparktest/cyber/__init__.py similarity index 100% rename from src/main/python/mmlspark/opencv/__init__.py rename to core/src/test/python/mmlsparktest/cyber/__init__.py diff --git a/src/main/python/mmlspark/plot/__init__.py b/core/src/test/python/mmlsparktest/cyber/anamoly/__init__.py similarity index 100% rename from src/main/python/mmlspark/plot/__init__.py rename to core/src/test/python/mmlsparktest/cyber/anamoly/__init__.py diff --git a/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py b/core/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py rename to core/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py diff --git a/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py b/core/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py rename to core/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py diff --git a/src/test/python/mmlsparktest/cyber/explain_tester.py b/core/src/test/python/mmlsparktest/cyber/explain_tester.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/explain_tester.py rename to core/src/test/python/mmlsparktest/cyber/explain_tester.py diff --git a/src/main/python/mmlspark/stages/__init__.py b/core/src/test/python/mmlsparktest/cyber/feature/__init__.py similarity index 100% rename from src/main/python/mmlspark/stages/__init__.py rename to core/src/test/python/mmlsparktest/cyber/feature/__init__.py diff --git a/src/test/python/mmlsparktest/cyber/feature/test_indexers.py 
b/core/src/test/python/mmlsparktest/cyber/feature/test_indexers.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/feature/test_indexers.py rename to core/src/test/python/mmlsparktest/cyber/feature/test_indexers.py diff --git a/src/test/python/mmlsparktest/cyber/feature/test_scalers.py b/core/src/test/python/mmlsparktest/cyber/feature/test_scalers.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/feature/test_scalers.py rename to core/src/test/python/mmlsparktest/cyber/feature/test_scalers.py diff --git a/src/main/python/mmlspark/vw/__init__.py b/core/src/test/python/mmlsparktest/cyber/utils/__init__.py similarity index 100% rename from src/main/python/mmlspark/vw/__init__.py rename to core/src/test/python/mmlsparktest/cyber/utils/__init__.py diff --git a/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py b/core/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py rename to core/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py diff --git a/src/test/__init__.py b/core/src/test/python/mmlsparktest/nn/__init__.py similarity index 100% rename from src/test/__init__.py rename to core/src/test/python/mmlsparktest/nn/__init__.py diff --git a/src/test/python/mmlsparktest/nn/test_ball_tree.py b/core/src/test/python/mmlsparktest/nn/test_ball_tree.py similarity index 100% rename from src/test/python/mmlsparktest/nn/test_ball_tree.py rename to core/src/test/python/mmlsparktest/nn/test_ball_tree.py diff --git a/src/test/python/__init__.py b/core/src/test/python/mmlsparktest/recommendation/__init__.py similarity index 100% rename from src/test/python/__init__.py rename to core/src/test/python/mmlsparktest/recommendation/__init__.py diff --git a/src/test/python/mmlsparktest/recommendation/test_ranking.py b/core/src/test/python/mmlsparktest/recommendation/test_ranking.py similarity index 100% rename from 
src/test/python/mmlsparktest/recommendation/test_ranking.py rename to core/src/test/python/mmlsparktest/recommendation/test_ranking.py diff --git a/src/test/python/setup.py b/core/src/test/python/setup.py similarity index 100% rename from src/test/python/setup.py rename to core/src/test/python/setup.py diff --git a/core/src/test/resources/audio1.txt b/core/src/test/resources/audio1.txt new file mode 100644 index 00000000000..de9993a6af0 --- /dev/null +++ b/core/src/test/resources/audio1.txt @@ -0,0 +1 @@ +Content like data models tests and end points are organized into projects in the custom speech portal. Each project is specific to a domain and country slash language. For example, you may create a project for call centers that use English in the United States to create your first project select the speech to text slash custom speech, then click new project follow the instructions provided by The Wizard to create your project after you've created a project you should see 4 tabs data testing training. And deployment use the links provided in Next steps to learn how to use each tab. 
\ No newline at end of file diff --git a/src/test/resources/benchmarks/benchmarkBasicDataTypes.json b/core/src/test/resources/benchmarks/benchmarkBasicDataTypes.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkBasicDataTypes.json rename to core/src/test/resources/benchmarks/benchmarkBasicDataTypes.json diff --git a/src/test/resources/benchmarks/benchmarkDate.json b/core/src/test/resources/benchmarks/benchmarkDate.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkDate.json rename to core/src/test/resources/benchmarks/benchmarkDate.json diff --git a/src/test/resources/benchmarks/benchmarkNoOneHot.json b/core/src/test/resources/benchmarks/benchmarkNoOneHot.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkNoOneHot.json rename to core/src/test/resources/benchmarks/benchmarkNoOneHot.json diff --git a/src/test/resources/benchmarks/benchmarkOneHot.json b/core/src/test/resources/benchmarks/benchmarkOneHot.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkOneHot.json rename to core/src/test/resources/benchmarks/benchmarkOneHot.json diff --git a/src/test/resources/benchmarks/benchmarkString.json b/core/src/test/resources/benchmarks/benchmarkString.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkString.json rename to core/src/test/resources/benchmarks/benchmarkString.json diff --git a/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json b/core/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkStringIndexOneHot.json rename to core/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json diff --git a/src/test/resources/benchmarks/benchmarkStringMissing.json b/core/src/test/resources/benchmarks/benchmarkStringMissing.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkStringMissing.json rename to 
core/src/test/resources/benchmarks/benchmarkStringMissing.json diff --git a/src/test/resources/benchmarks/benchmarkVectors.json b/core/src/test/resources/benchmarks/benchmarkVectors.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkVectors.json rename to core/src/test/resources/benchmarks/benchmarkVectors.json diff --git a/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv rename to core/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv diff --git a/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv rename to core/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv diff --git a/src/test/resources/demoUsage.csv.gz b/core/src/test/resources/demoUsage.csv.gz similarity index 100% rename from src/test/resources/demoUsage.csv.gz rename to core/src/test/resources/demoUsage.csv.gz diff --git a/src/test/resources/greyhound.jpg b/core/src/test/resources/greyhound.jpg similarity index 100% rename from src/test/resources/greyhound.jpg rename to core/src/test/resources/greyhound.jpg diff --git a/src/test/resources/sim_count1.csv.gz b/core/src/test/resources/sim_count1.csv.gz similarity index 100% rename from src/test/resources/sim_count1.csv.gz rename to core/src/test/resources/sim_count1.csv.gz diff --git a/src/test/resources/sim_count3.csv.gz b/core/src/test/resources/sim_count3.csv.gz similarity index 100% rename from src/test/resources/sim_count3.csv.gz rename to core/src/test/resources/sim_count3.csv.gz diff --git a/src/test/resources/sim_jac1.csv.gz b/core/src/test/resources/sim_jac1.csv.gz similarity index 100% rename 
from src/test/resources/sim_jac1.csv.gz rename to core/src/test/resources/sim_jac1.csv.gz diff --git a/src/test/resources/sim_jac3.csv.gz b/core/src/test/resources/sim_jac3.csv.gz similarity index 100% rename from src/test/resources/sim_jac3.csv.gz rename to core/src/test/resources/sim_jac3.csv.gz diff --git a/src/test/resources/sim_lift1.csv.gz b/core/src/test/resources/sim_lift1.csv.gz similarity index 100% rename from src/test/resources/sim_lift1.csv.gz rename to core/src/test/resources/sim_lift1.csv.gz diff --git a/src/test/resources/sim_lift3.csv.gz b/core/src/test/resources/sim_lift3.csv.gz similarity index 100% rename from src/test/resources/sim_lift3.csv.gz rename to core/src/test/resources/sim_lift3.csv.gz diff --git a/src/test/resources/user_aff.csv.gz b/core/src/test/resources/user_aff.csv.gz similarity index 100% rename from src/test/resources/user_aff.csv.gz rename to core/src/test/resources/user_aff.csv.gz diff --git a/src/test/resources/userpred_count3_userid_only.csv.gz b/core/src/test/resources/userpred_count3_userid_only.csv.gz similarity index 100% rename from src/test/resources/userpred_count3_userid_only.csv.gz rename to core/src/test/resources/userpred_count3_userid_only.csv.gz diff --git a/src/test/resources/userpred_jac3_userid_only.csv.gz b/core/src/test/resources/userpred_jac3_userid_only.csv.gz similarity index 100% rename from src/test/resources/userpred_jac3_userid_only.csv.gz rename to core/src/test/resources/userpred_jac3_userid_only.csv.gz diff --git a/src/test/resources/userpred_lift3_userid_only.csv.gz b/core/src/test/resources/userpred_lift3_userid_only.csv.gz similarity index 100% rename from src/test/resources/userpred_lift3_userid_only.csv.gz rename to core/src/test/resources/userpred_lift3_userid_only.csv.gz diff --git a/src/test/scala/com/microsoft/ml/spark/Secrets.scala b/core/src/test/scala/com/microsoft/ml/spark/Secrets.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/Secrets.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/Secrets.scala diff --git a/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala b/core/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala rename to core/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala diff --git a/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala b/core/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala rename to core/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala diff --git a/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala b/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala new file mode 100644 index 00000000000..c0a5b315014 --- /dev/null +++ b/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala @@ -0,0 +1,87 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark.codegen + +import java.io.File + +import com.microsoft.ml.spark.codegen.CodegenConfigProtocol._ +import com.microsoft.ml.spark.core.env.FileUtilities._ +import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.test.fuzzing.PyTestFuzzing +import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices +import org.apache.commons.io.FileUtils +import spray.json._ + + +object TestGen { + + import CodeGenUtils._ + + def generatePythonTests(conf: CodegenConfig): Unit = { + instantiateServices[PyTestFuzzing[_]](conf.jarName).foreach { ltc => + try { + ltc.makePyTestFile(conf) + } catch { + case _: NotImplementedError => + println(s"ERROR: Could not generate test for ${ltc.testClassName} because of Complex Parameters") + } + } + } + + private def makeInitFiles(conf: CodegenConfig, packageFolder: String = ""): Unit = { + val dir = new File(new File(conf.pyTestDir, "mmlsparktest"), packageFolder) + if (!dir.exists()){ + dir.mkdirs() + } + writeFile(new File(dir, "__init__.py"), "") + dir.listFiles().filter(_.isDirectory).foreach(f => + makeInitFiles(conf, packageFolder + "/" + f.getName) + ) + } + + + //noinspection ScalaStyle + def generatePyPackageData(conf: CodegenConfig): Unit = { + if (!conf.pySrcDir.exists()) { + conf.pySrcDir.mkdir() + } + writeFile(join(conf.pyTestDir,"mmlsparktest", "spark.py"), + s""" + |# Copyright (C) Microsoft Corporation. All rights reserved. + |# Licensed under the MIT License. See LICENSE in project root for information. 
+ | + |from pyspark.sql import SparkSession, SQLContext + |import os + |import mmlspark + |from mmlspark.core import __spark_package_version__ + | + |spark = (SparkSession.builder + | .master("local[*]") + | .appName("PysparkTests") + | .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark:" + __spark_package_version__) + | .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") + | .config("spark.executor.heartbeatInterval", "60s") + | .config("spark.sql.shuffle.partitions", 10) + | .config("spark.sql.crossJoin.enabled", "true") + | .getOrCreate()) + | + |sc = SQLContext(spark.sparkContext) + | + |""".stripMargin) + } + + + def main(args: Array[String]): Unit = { + val conf = args.head.parseJson.convertTo[CodegenConfig] + clean(conf.testDataDir) + clean(conf.pyTestDir) + generatePythonTests(conf) + generatePyPackageData(conf) + //TestBase.stopSparkSession() + if (toDir(conf.pyTestOverrideDir).exists()){ + FileUtils.copyDirectoryToDirectory(toDir(conf.pyTestOverrideDir), toDir(conf.pyTestDir)) + } + makeInitFiles(conf) + } +} diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala 
b/core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala b/core/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala similarity index 79% rename from src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala index faaf19398ea..031d1b333e4 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala @@ -31,23 +31,27 @@ object SparkSessionFactory { if (File.separator != "\\") path else path.replaceFirst("[A-Z]:", "").replace("\\", "/") } + def currentDir(): String = System.getProperty("user.dir") def getSession(name: String, logLevel: String = "WARN", numRetries: Int = 1, numCores: Option[Int] = None): SparkSession = { val cores = numCores.map(_.toString).getOrElse("*") val conf = new SparkConf() - .setAppName(name) - .setMaster(if (numRetries == 1){s"local[$cores]"}else{s"local[$cores, $numRetries]"}) - .set("spark.logConf", "true") - .set("spark.sql.shuffle.partitions", "20") - .set("spark.driver.maxResultSize", "6g") - .set("spark.sql.warehouse.dir", 
SparkSessionFactory.LocalWarehousePath) - .set("spark.sql.crossJoin.enabled", "true") + .setAppName(name) + .setMaster(if (numRetries == 1) { + s"local[$cores]" + } else { + s"local[$cores, $numRetries]" + }) + .set("spark.logConf", "true") + .set("spark.sql.shuffle.partitions", "20") + .set("spark.driver.maxResultSize", "6g") + .set("spark.sql.warehouse.dir", SparkSessionFactory.LocalWarehousePath) + .set("spark.sql.crossJoin.enabled", "true") val sess = SparkSession.builder() .config(conf) .getOrCreate() - sess.sparkContext.setLogLevel(logLevel) sess } diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala index 097c120581b..84a2bfcd08f 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala @@ -3,6 +3,8 @@ package com.microsoft.ml.spark.core.test.base +import java.nio.file.Files + import breeze.linalg.norm.Impl import breeze.linalg.{norm, DenseVector => BDV} import breeze.math.Field @@ -17,7 +19,6 @@ import org.scalatest._ import org.scalatest.concurrent.TimeLimits import org.scalatest.time.{Seconds, Span} -import java.nio.file.Files import scala.concurrent._ import scala.reflect.ClassTag diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala 
b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala similarity index 90% rename from src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala index 9adbad67236..7c6540c8861 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala @@ -7,15 +7,15 @@ import java.io.File import java.nio.charset.StandardCharsets import java.nio.file.Files -import com.microsoft.ml.spark.codegen.Config +import com.microsoft.ml.spark.codegen.CodegenConfig import com.microsoft.ml.spark.core.env.FileUtilities -import com.microsoft.ml.spark.core.test.base.TestBase import org.apache.commons.io.FileUtils import org.apache.spark.ml._ import org.apache.spark.ml.param.{DataFrameEquality, ExternalPythonWrappableParam, ParamPair} import org.apache.spark.ml.util.{MLReadable, MLWritable} import org.apache.spark.sql.DataFrame import com.microsoft.ml.spark.codegen.GenerationUtils._ +import com.microsoft.ml.spark.core.test.base.TestBase /** * Class for holding test information, call by name to avoid uneccesary computations in test generations @@ -50,17 +50,17 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality val testClassName: String = this.getClass.getName.split(".".toCharArray).last - val testDataDir: File = FileUtilities.join( - Config.TestDataDir, this.getClass.getName.split(".".toCharArray).last) + def testDataDir(conf: CodegenConfig): File = FileUtilities.join( + conf.testDataDir, this.getClass.getName.split(".".toCharArray).last) - def saveDataset(df: DataFrame, name: String): Unit = { - df.write.mode("overwrite").parquet(new File(testDataDir, s"$name.parquet").toString) + def saveDataset(conf: CodegenConfig, df: DataFrame, name: String): Unit = { + df.write.mode("overwrite").parquet(new File(testDataDir(conf), s"$name.parquet").toString) 
} - def saveModel(model: S, name: String): Unit = { + def saveModel(conf: CodegenConfig, model: S, name: String): Unit = { model match { case writable: MLWritable => - writable.write.overwrite().save(new File(testDataDir, s"$name.model").toString) + writable.write.overwrite().save(new File(testDataDir(conf), s"$name.model").toString) case _ => throw new IllegalArgumentException(s"${model.getClass.getName} is not writable") } @@ -69,14 +69,14 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality val testFitting = false - def saveTestData(): Unit = { - testDataDir.mkdirs() + def saveTestData(conf: CodegenConfig): Unit = { + testDataDir(conf).mkdirs() pyTestObjects().zipWithIndex.foreach { case (to, i) => - saveModel(to.stage, s"model-$i") + saveModel(conf, to.stage, s"model-$i") if (testFitting) { - saveDataset(to.fitDF, s"fit-$i") - saveDataset(to.transDF, s"trans-$i") - to.validateDF.foreach(saveDataset(_, s"val-$i")) + saveDataset(conf, to.fitDF, s"fit-$i") + saveDataset(conf, to.transDF, s"trans-$i") + to.validateDF.foreach(saveDataset(conf, _, s"val-$i")) } } } @@ -144,9 +144,9 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality } - def makePyTestFile(): Unit = { + def makePyTestFile(conf: CodegenConfig): Unit = { spark - saveTestData() + saveTestData(conf) val generatedTests = pyTestObjects().zipWithIndex.map { case (to, i) => makePyTests(to, i) } val stage = pyTestObjects().head.stage val stageName = stage.getClass.getName.split(".".toCharArray).last @@ -159,7 +159,7 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality |from os.path import join |import json | - |test_data_dir = "${testDataDir.toString.replaceAllLiterally("\\", "\\\\")}" + |test_data_dir = "${testDataDir(conf).toString.replaceAllLiterally("\\", "\\\\")}" | | |class $testClassName(unittest.TestCase): @@ -180,7 +180,7 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality val 
testFolders = importPath.mkString(".") .replaceAllLiterally("com.microsoft.ml.spark", "mmlsparktest").split(".".toCharArray) - val testDir = FileUtilities.join((Seq(Config.PyTestDir.toString) ++ testFolders.toSeq): _*) + val testDir = FileUtilities.join((Seq(conf.pyTestDir.toString) ++ testFolders.toSeq): _*) testDir.mkdirs() Files.write( FileUtilities.join(testDir, "test_" + camelToSnake(testClassName) + ".py").toPath, diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala b/core/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/HasExplainTargetSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/HasExplainTargetSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/HasExplainTargetSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/HasExplainTargetSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/KernelSHAPSamplerSupportSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/KernelSHAPSamplerSupportSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/KernelSHAPSamplerSupportSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/KernelSHAPSamplerSupportSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/LassoRegressionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/LassoRegressionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/LassoRegressionSuite.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/explainers/split1/LassoRegressionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/LeastSquaresRegressionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/LeastSquaresRegressionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/LeastSquaresRegressionSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/LeastSquaresRegressionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala index 4606bedcf70..0c4ea711ed0 100644 --- a/src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala @@ -6,7 +6,6 @@ package com.microsoft.ml.spark.explainers.split1 import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import breeze.stats.distributions.RandBasis import breeze.stats.{mean, stddev} -import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.explainers.BreezeUtils._ import com.microsoft.ml.spark.explainers._ import com.microsoft.ml.spark.io.image.ImageUtils @@ -17,8 +16,9 @@ import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types._ import org.scalactic.{Equality, TolerantNumerics} import org.scalatest.Matchers._ - import java.nio.file.{Files, Paths} + +import com.microsoft.ml.spark.core.test.base.TestBase import javax.imageio.ImageIO class SamplerSuite extends TestBase { diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularLIMEExplainerSuite.scala 
b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularLIMEExplainerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularLIMEExplainerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularLIMEExplainerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularSHAPExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularSHAPExplainerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularSHAPExplainerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularSHAPExplainerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/TextExplainersSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TextExplainersSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/TextExplainersSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TextExplainersSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorLIMEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorLIMEExplainerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorLIMEExplainerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorLIMEExplainerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorSHAPExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorSHAPExplainerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorSHAPExplainerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorSHAPExplainerSuite.scala diff --git 
a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala index 7ff30e5c723..72168f2badc 100644 --- a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala @@ -13,7 +13,7 @@ import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject} import org.apache.commons.io.FileUtils import org.apache.spark.ml.PipelineModel import org.apache.spark.ml.feature.StringIndexer -import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors, Vector} +import org.apache.spark.ml.linalg.{DenseVector, 
SparseVector, Vector, Vectors} import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql._ diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala similarity index 93% rename from src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala index 197f85a6fb5..9c014e715a8 100644 
--- a/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala @@ -3,13 +3,13 @@ package com.microsoft.ml.spark.flaky -import com.microsoft.ml.spark.core.test.base.{SparkSessionFactory, TestBase, TimeLimitedFlaky} +import com.microsoft.ml.spark.core.test.base.{TestBase, TimeLimitedFlaky} import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import com.microsoft.ml.spark.stages.PartitionConsolidator import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{DoubleType, StructType} -import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.scalatest.Assertion class PartitionConsolidatorSuite extends TransformerFuzzing[PartitionConsolidator] with TimeLimitedFlaky { diff --git a/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala b/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala new file mode 100644 index 00000000000..63dbea62576 --- /dev/null +++ b/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala @@ -0,0 +1,109 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+
+package com.microsoft.ml.spark.image
+
+import java.io.File
+import java.net.URL
+
+import com.microsoft.ml.spark.build.BuildInfo
+import com.microsoft.ml.spark.core.env.FileUtilities
+import com.microsoft.ml.spark.core.test.base.TestBase
+import org.apache.spark.ml.linalg.DenseVector
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import com.microsoft.ml.spark.io.IOImplicits.dfrToDfre
+import org.apache.commons.io.FileUtils
+import org.apache.spark.sql.functions.col
+
+/** Shared fixtures for image-related test suites: dataset/model paths below
+  * `BuildInfo.datasetDir`, column names, small synthetic DataFrames and lazily loaded
+  * image DataFrames.
+  */
+trait ImageTestUtils extends TestBase {
+
+  val filesRoot: String = BuildInfo.datasetDir.toString
+  val imagePath: String = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
+  val modelPath: String = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString
+  val inputCol: String = "cntk_images"
+  val outputCol: String = "out"
+  val labelCol: String = "labels"
+
+  // presumably channels * height * width of one CIFAR image - confirm
+  val featureVectorLength: Int = 3 * 32 * 32
+  lazy val saveFile: String = new File(tmpDir.toFile, "spark-z.model").toString
+
+  /** Six fixed rows of ten doubles each, as a single-column DataFrame.
+    * Its row count is what `compareToTestModel` checks results against.
+    */
+  def testModelDF(spark: SparkSession): DataFrame = {
+    import spark.implicits._
+    spark.sparkContext.parallelize(Seq(
+      Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720,
+        -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090),
+      Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990,
+        -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880),
+      Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967,
+        3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830),
+      Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430,
+        -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510),
+      Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690,
+        4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270),
+      Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470,
+        0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF
+  }
+
+  /** Loads the images under `imagePath`, runs them through `UnrollImage`
+    * ("image" -> `inputCol`) and keeps only `inputCol`.
+    */
+  def testImages(spark: SparkSession): DataFrame = {
+    val images = spark.read.image.load(imagePath)
+
+    val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol)
+
+    unroll.transform(images).select(inputCol)
+  }
+
+  /** Builds `rows` all-zero rows with columns (`inputCol`, `labelCol`).
+    *
+    * @param size         length of the zero array stored in `inputCol`
+    * @param outputDouble when true the array elements are Double, otherwise Float;
+    *                     the label is a Double 0.0 in both cases
+    */
+  def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = {
+    import spark.implicits._
+    if (outputDouble) {
+      List
+        .fill(rows)(List.fill(size)(0.0).toArray)
+        .zip(List.fill(rows)(0.0))
+        .toDF(inputCol, labelCol)
+    } else {
+      List
+        .fill(rows)(List.fill(size)(0.0.toFloat).toArray)
+        .zip(List.fill(rows)(0.0))
+        .toDF(inputCol, labelCol)
+    }
+  }
+
+  /** Sanity-checks a scored DataFrame: exactly the columns `inputCol` and `outputCol`,
+    * as many rows as `testModelDF`, and a largest output value strictly inside (-10, 10).
+    */
+  protected def compareToTestModel(result: DataFrame) = {
+    //TODO improve checks
+    assert(result.columns.toSet == Set(inputCol, outputCol))
+    assert(result.count() == testModelDF(result.sparkSession).count())
+    val max = result
+      .select(outputCol)
+      .collect()
+      .map(row => row.getAs[DenseVector](0).toArray.max)
+      .max
+    // short-circuit && instead of the bitwise-style & on Booleans
+    assert(max < 10 && max > -10)
+  }
+
+  lazy val images: DataFrame = spark.read.image.load(imagePath)
+    .withColumnRenamed("image", inputCol)
+  lazy val binaryImages: DataFrame = spark.read.binary.load(imagePath)
+    .select(col("value.bytes").alias(inputCol))
+
+  lazy val groceriesPath = FileUtilities.join(BuildInfo.datasetDir, "Images","Grocery")
+  // The load pattern is the path immediately followed by "**" (no separator in between),
+  // exactly as before; the string conversion is now explicit.
+  lazy val groceryImages: DataFrame = spark.read.image
+    .option("dropInvalid", true)
+    .load(groceriesPath.toString + "**")
+    .withColumnRenamed("image", inputCol)
+
+  /** Downloads the greyscale sample image into this suite's `tmpDir` and returns the
+    * local path. A fixed `/tmp/greyscale.jpg` was used before, which does not exist on
+    * every platform and is shared by concurrently running test JVMs.
+    */
+  lazy val greyscaleImageLocation: String = {
+    val target = new File(tmpDir.toFile, "greyscale.jpg")
+    if (target.exists()) {
+      target.delete()
+    }
+    FileUtils.copyURLToFile(new URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), target)
+    target.toString
+  }
+
+  lazy val greyscaleImage: DataFrame = spark
+    .read.image.load(greyscaleImageLocation)
+    .select(col("image").alias(inputCol))
+
+  lazy val greyscaleBinary: DataFrame = spark
+    .read.binary.load(greyscaleImageLocation)
+    .select(col("value.bytes").alias(inputCol))
+
+}
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala
b/core/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala index 13592cec90b..b611ef5158e 100644 --- a/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala @@ -5,7 +5,7 @@ package com.microsoft.ml.spark.io.split1 import java.io.{File, FileInputStream} -import com.microsoft.ml.spark.cognitive.OsUtils +import com.microsoft.ml.spark.core.utils.OsUtils import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.schema.ImageSchemaUtils import com.microsoft.ml.spark.core.test.base.TestBase diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala 
b/core/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala index 5507196ee7b..40cf3936191 100644 --- a/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala @@ -5,7 +5,6 @@ package com.microsoft.ml.spark.io.split2 import java.io.File import java.util.UUID -import java.util.concurrent.TimeUnit import com.microsoft.ml.spark.core.test.base.{Flaky, TestBase} import com.microsoft.ml.spark.io.IOImplicits._ @@ -15,7 +14,6 @@ import org.apache.spark.sql.streaming.{DataStreamReader, StreamingQuery, Trigger import org.apache.spark.sql.types.BinaryType import scala.concurrent.Await -import scala.concurrent.duration.Duration // scalastyle:off magic.number class ContinuousHTTPSuite extends TestBase with Flaky with HTTPTestUtils { diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala similarity index 99% rename from 
src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala index 5dd5b437408..d5d106315b8 100644 --- a/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala @@ -354,7 +354,7 @@ class DistributedHTTPSuite extends TestBase with Flaky with HTTPTestUtils { processes.foreach { p => p.waitFor - val error = IOUtils.toString(p.getErrorStream) + val error = IOUtils.toString(p.getErrorStream, "UTF-8") assert(error === "") } } diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala b/core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala similarity index 98% rename from src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala rename to core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala index e623605967e..2ee5fd153e2 100644 --- a/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala @@ -86,10 +86,6 @@ class VerifyIsolationForest extends Benchmarks with 
EstimatorFuzzing[IsolationFo data } - test("foo"){ - new IsolationForest().makePyFile() - } - override def reader: MLReadable[_] = IsolationForest override def modelReader: MLReadable[_] = IsolationForestModel diff --git a/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala new file mode 100644 index 00000000000..b58e597944b --- /dev/null +++ b/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala @@ -0,0 +1,66 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.lime
+
+import breeze.linalg.{*, DenseMatrix}
+import breeze.stats.distributions.Rand
+import com.microsoft.ml.spark.core.test.base.TestBase
+import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing}
+import org.apache.spark.ml.linalg.DenseVector
+import org.apache.spark.ml.param.DataFrameEquality
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.ml.util.MLReadable
+
+/** Fixture shared by the tabular LIME suites: a random `nRows x d1` design matrix `x`,
+  * a fixed weight column `m`, labels `y = x * m`, a `LinearRegression` fitted on them and
+  * a `TabularLIME` explainer (plus its fitted model) built on top of that regression.
+  */
+trait LimeTestBase extends TestBase {
+
+  import spark.implicits._
+
+  lazy val nRows = 100
+  lazy val d1 = 3
+  lazy val d2 = 1
+
+  // Weights the labels are generated with.
+  lazy val m: DenseMatrix[Double] = new DenseMatrix(d1, d2, Array(1.0, -1.0, 2.0))
+  lazy val x: DenseMatrix[Double] = DenseMatrix.rand(nRows, d1, Rand.gaussian)
+  lazy val noise: DenseMatrix[Double] = DenseMatrix.rand(nRows, d2, Rand.gaussian) * 0.1
+  // The noise term is currently not added to the labels.
+  lazy val y = x * m //+ noise
+
+  // One Spark vector per matrix row, and the single label value of each row.
+  lazy val xRows = x(*, ::).iterator.toSeq.map(row => new DenseVector(row.toArray))
+  lazy val yRows = y(*, ::).iterator.toSeq.map(row => row(0))
+  lazy val df = xRows.zip(yRows).toDF("features", "label")
+
+  lazy val model = new LinearRegression().fit(df)
+
+  lazy val lime = new TabularLIME()
+    .setModel(model)
+    .setInputCol("features")
+    .setPredictionCol(model.getPredictionCol)
+    .setOutputCol("out")
+    .setNSamples(1000)
+
+  lazy val limeModel = lime.fit(df)
+}
+
+/** Estimator-level fuzzing and a usage check for [[TabularLIME]]. */
+class TabularLIMESuite extends EstimatorFuzzing[TabularLIME]
+  with DataFrameEquality
+  with LimeTestBase {
+
+  // NOTE(review): the name says "text lime" although this suite exercises TabularLIME;
+  // presumably copied from the text suite - confirm before renaming.
+  test("text lime usage test check") {
+    // Every explanation in "out" is expected to match the weights used to build the labels.
+    val expected = new DenseVector(m.data)
+    val explained = limeModel.transform(df).select("out").collect()
+    for (row <- explained) {
+      assert(row.getAs[DenseVector](0) === expected)
+    }
+  }
+
+  override def testObjects(): Seq[TestObject[TabularLIME]] = Seq(new TestObject(lime, df))
+
+  override def reader: MLReadable[_] = TabularLIME
+
+  override def modelReader: MLReadable[_] = TabularLIMEModel
+}
+
+/** Transformer-level fuzzing for the fitted [[TabularLIMEModel]]. */
+class TabularLIMEModelSuite extends TransformerFuzzing[TabularLIMEModel]
+  with DataFrameEquality
+  with LimeTestBase {
+
+  override def testObjects(): Seq[TestObject[TabularLIMEModel]] = Seq(new TestObject(limeModel, df))
+
+  override def reader: MLReadable[_] = TabularLIMEModel
+}
diff --git a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala similarity index 96% rename from src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala index 5d2c26e330f..289720f9691 100644 --- a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala @@ -7,13 +7,13 @@ import java.awt.Color import java.awt.image.BufferedImage import java.io.File -import com.microsoft.ml.spark.cntk.CNTKTestUtils +import com.microsoft.ml.spark.image.ImageTestUtils import com.microsoft.ml.spark.io.image.ImageUtils import javax.imageio.ImageIO import scala.util.Random -class SuperpixelSuite extends CNTKTestUtils { +class SuperpixelSuite extends ImageTestUtils { lazy val sp1 = new Superpixel(img, 16, 130) lazy val sp2 = new Superpixel(img2, 100, 130) diff --git a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala similarity index 90% rename from src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala index 881aefed41a..0c4a5b78d0b 100644 --- a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala @@ -4,12 +4,12 @@ package com.microsoft.ml.spark.lime import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.image.NetworkUtils +import com.microsoft.ml.spark.image.ImageTestUtils import com.microsoft.ml.spark.io.split1.FileReaderUtils import org.apache.spark.ml.util.MLReadable class SuperpixelTransformerSuite extends TransformerFuzzing[SuperpixelTransformer] - with NetworkUtils with FileReaderUtils { + with ImageTestUtils with FileReaderUtils { lazy val spt: SuperpixelTransformer = new SuperpixelTransformer().setInputCol(inputCol) test("basic functionality"){ diff --git a/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala similarity index 96% rename from src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala index 7ce9ba7e569..bda6857db7c 100644 --- a/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala @@ -42,9 +42,7 
@@ object DatabricksUtilities extends HasHttpClient { val Folder = s"/MMLSparkBuild/build_${BuildInfo.version}" // MMLSpark info - val TruncatedScalaVersion: String = BuildInfo.scalaVersion - .split(".".toCharArray.head).dropRight(1).mkString(".") - val Version = s"com.microsoft.ml.spark:${BuildInfo.name}_$TruncatedScalaVersion:${BuildInfo.version}" + val Version = s"com.microsoft.ml.spark:mmlspark:${BuildInfo.version}" val Repository = "https://mmlspark.azureedge.net/maven" val Libraries: String = List( @@ -59,7 +57,7 @@ object DatabricksUtilities extends HasHttpClient { val TimeoutInMillis: Int = 40 * 60 * 1000 val NotebookFiles: Array[File] = Option( - FileUtilities.join(BuildInfo.baseDirectory, "notebooks", "samples").getCanonicalFile.listFiles() + FileUtilities.join(BuildInfo.baseDirectory.getParent, "notebooks").getCanonicalFile.listFiles() ).get val ParallizableNotebooks = NotebookFiles.filterNot(_.getName.contains("Vowpal")) @@ -88,7 +86,7 @@ object DatabricksUtilities extends HasHttpClient { if (response.getStatusLine.getStatusCode != 200) { throw new RuntimeException(s"Failed: response: $response") } - IOUtils.toString(response.getEntity.getContent).parseJson + IOUtils.toString(response.getEntity.getContent, "UTF-8").parseJson }.get }) } @@ -104,7 +102,7 @@ object DatabricksUtilities extends HasHttpClient { val entity = IOUtils.toString(response.getEntity.getContent, "UTF-8") throw new RuntimeException(s"Failed:\n entity:$entity \n response: $response") } - IOUtils.toString(response.getEntity.getContent).parseJson + IOUtils.toString(response.getEntity.getContent, "UTF-8").parseJson }.get }) } diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala diff --git 
a/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala diff --git 
a/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala similarity index 92% rename from src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala index 1507d152500..c96764cfd29 100644 --- a/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala @@ -3,9 +3,8 @@ package com.microsoft.ml.spark.stages -import com.microsoft.ml.spark.codegen.Config import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.core.test.fuzzing.{PyTestFuzzing, TestObject, TransformerFuzzing} +import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.util.MLReadable class DropColumnsSuite extends TestBase with TransformerFuzzing[DropColumns] { diff --git a/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala diff 
--git a/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala similarity index 100% rename 
from src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala index 959486a9093..387eb04e375 100644 --- a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala @@ -6,7 +6,6 @@ package com.microsoft.ml.spark.train import java.io.File import com.microsoft.ml.spark.core.schema.SchemaConstants -import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.core.test.benchmarks.Benchmarks import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject} import com.microsoft.ml.spark.featurize.ValueIndexer @@ -18,6 +17,7 @@ import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, Multiclas import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Row} import com.microsoft.ml.spark.codegen.GenerationUtils +import com.microsoft.ml.spark.core.test.base.TestBase object ClassifierTestUtils { diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala diff --git a/src/main/R/model_downloader.R b/deep-learning/src/main/R/model_downloader.R similarity index 100% rename from src/main/R/model_downloader.R rename to deep-learning/src/main/R/model_downloader.R diff --git a/src/main/python/mmlspark/cntk/CNTKModel.py b/deep-learning/src/main/python/mmlspark/cntk/CNTKModel.py similarity index 100% rename from src/main/python/mmlspark/cntk/CNTKModel.py rename to deep-learning/src/main/python/mmlspark/cntk/CNTKModel.py diff --git a/src/main/python/mmlspark/image/ImageFeaturizer.py b/deep-learning/src/main/python/mmlspark/cntk/ImageFeaturizer.py similarity index 94% rename from 
src/main/python/mmlspark/image/ImageFeaturizer.py rename to deep-learning/src/main/python/mmlspark/cntk/ImageFeaturizer.py index 9c4ae54a29e..a85cd56a095 100644 --- a/src/main/python/mmlspark/image/ImageFeaturizer.py +++ b/deep-learning/src/main/python/mmlspark/cntk/ImageFeaturizer.py @@ -6,7 +6,7 @@ if sys.version >= '3': basestring = str -from mmlspark.image._ImageFeaturizer import _ImageFeaturizer +from mmlspark.cntk._ImageFeaturizer import _ImageFeaturizer from pyspark.ml.common import inherit_doc from pyspark.sql import SparkSession diff --git a/src/test/python/mmlsparktest/__init__.py b/deep-learning/src/main/python/mmlspark/cntk/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/__init__.py rename to deep-learning/src/main/python/mmlspark/cntk/__init__.py diff --git a/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala b/deep-learning/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala similarity index 100% rename from src/main/scala/com/microsoft/CNTK/SerializableFunction.scala rename to deep-learning/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ImageFeaturizer.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ImageFeaturizer.scala index 2db42e83b0c..73dce569944 100644 --- a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala +++ b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ImageFeaturizer.scala @@ -1,20 +1,20 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. -package com.microsoft.ml.spark.image +package com.microsoft.ml.spark.cntk import com.microsoft.CNTK.CNTKExtensions._ import com.microsoft.CNTK.{SerializableFunction => CNTKFunction} -import com.microsoft.ml.spark.cntk.CNTKModel -import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol} import com.microsoft.ml.spark.codegen.Wrappable +import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol} import com.microsoft.ml.spark.core.schema.{DatasetExtensions, ImageSchemaUtils} import com.microsoft.ml.spark.downloader.ModelSchema +import com.microsoft.ml.spark.image.{ResizeImageTransformer, UnrollBinaryImage, UnrollImage} import com.microsoft.ml.spark.logging.BasicLogging -import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.sql.types.{BinaryType, StructType} import org.apache.spark.sql.{DataFrame, Dataset} @@ -132,7 +132,7 
@@ class ImageFeaturizer(val uid: String) extends Transformer with HasInputCol with /** @group getParam */ def getLayerNames: Array[String] = $(layerNames) - setDefault(cutOutputLayers -> 1, outputCol -> (uid + "_output"), dropNa->true) + setDefault(cutOutputLayers -> 1, outputCol -> (uid + "_output"), dropNa -> true) override def transform(dataset: Dataset[_]): DataFrame = { logTransform[DataFrame]({ diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ImageFeaturizer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ImageFeaturizer.txt diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala similarity index 89% rename from src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala index 3b68d0ee507..8c2a46c55e6 100644 --- a/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala +++ b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala @@ -7,6 +7,7 @@ import java.io._ import java.net.{URI, URL} import java.util +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.{Configuration => HadoopConf} import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} @@ -15,10 +16,8 @@ import org.apache.log4j.LogManager 
import org.apache.spark.sql.SparkSession import spray.json._ -import scala.annotation.tailrec import scala.collection.JavaConverters._ -import scala.concurrent.duration.{Duration, FiniteDuration} -import scala.concurrent.{Await, ExecutionContext, Future} +import scala.concurrent.duration.Duration /** Abstract representation of a repository for future expansion * @@ -34,32 +33,6 @@ private[spark] abstract class Repository[S <: Schema] { } -object FaultToleranceUtils { - def retryWithTimeout[T](times: Int, timeout: Duration)(f: => T): T ={ - try { - Await.result(Future(f)(ExecutionContext.global), timeout) - } catch { - case e: Exception if times >= 1 => - print(s"Received exception on call, retrying: $e") - retryWithTimeout(times-1, timeout)(f) - } - } - - val Backoffs: Seq[Int] = Seq(0, 100, 200, 500) - - def retryWithTimeout[T](times: Seq[Int] = Backoffs)(f: => T): T ={ - try { - f - } catch { - case e: Exception if times.nonEmpty => - println(s"Received exception on call, retrying: $e") - Thread.sleep(times.head) - retryWithTimeout(times.tail)(f) - } - } - -} - /** Exception returned if a repo cannot find the file * * @param uri : location of the file @@ -90,7 +63,7 @@ private[spark] class HDFSRepo[S <: Schema](val uri: URI, val hconf: HadoopConf) .filter(status => status.isFile & status.getPath.toString.endsWith(".meta")) .map(status => - IOUtils.toString(fs.open(status.getPath).getWrappedStream)) + IOUtils.toString(fs.open(status.getPath).getWrappedStream, "UTF-8")) schemaStrings.map(s => s.parseJson.convertTo[S]).toList } @@ -121,7 +94,7 @@ private[spark] class HDFSRepo[S <: Schema](val uri: URI, val hconf: HadoopConf) val newSchema = schema.updateURI(location) val schemaPath = new Path(location.getPath + ".meta") val osSchema = fs.create(schemaPath) - val schemaIs = IOUtils.toInputStream(newSchema.toJson.prettyPrint) + val schemaIs = IOUtils.toInputStream(newSchema.toJson.prettyPrint, "UTF-8") try { HUtils.copyBytes(schemaIs, osSchema, hconf) } finally { @@ 
-157,9 +130,9 @@ private[spark] class DefaultModelRepo(val baseURL: URL) extends Repository[Model val url = join(baseURL, "MANIFEST") val manifestStream = toStream(url) try { - val modelStreams = IOUtils.readLines(manifestStream).asScala.map(fn => toStream(join(baseURL, fn))) + val modelStreams = IOUtils.readLines(manifestStream, "UTF-8").asScala.map(fn => toStream(join(baseURL, fn))) try { - modelStreams.map(s => IOUtils.toString(s).parseJson.convertTo[ModelSchema]) + modelStreams.map(s => IOUtils.toString(s, "UTF-8").parseJson.convertTo[ModelSchema]) } finally { modelStreams.foreach(_.close()) } diff --git a/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala index 37b4b1ad615..f8483945360 100644 --- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala @@ -9,11 +9,12 @@ import com.microsoft.CNTK.CNTKExtensions._ import com.microsoft.CNTK.{SerializableFunction => CNTKFunction, _} import com.microsoft.ml.spark.core.env.StreamUtilities._ import com.microsoft.ml.spark.core.test.base.LinuxOnly +import com.microsoft.ml.spark.image.ImageTestUtils import org.apache.commons.io.IOUtils import scala.collection.JavaConverters._ -class CNTKBindingSuite extends LinuxOnly with CNTKTestUtils { +class CNTKBindingSuite extends LinuxOnly with ImageTestUtils { def 
toSeqSeq(fvv: FloatVectorVector): Seq[Seq[Float]] = { (0 until fvv.size.toInt).map(i => diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala index 34893a7015c..8d2285be0ad 100644 --- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala @@ -10,6 +10,7 @@ import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.test.base.LinuxOnly import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} +import com.microsoft.ml.spark.image.ImageTestUtils import org.apache.commons.io.FileUtils import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.linalg.DenseVector @@ -21,7 +22,7 @@ import org.apache.spark.sql.types._ import scala.util.Random -class CNTKModelSuite extends LinuxOnly with CNTKTestUtils with TransformerFuzzing[CNTKModel] { +class CNTKModelSuite extends LinuxOnly with ImageTestUtils with TransformerFuzzing[CNTKModel] { // TODO: Move away from getTempDirectoryPath and have TestBase provide one @@ -54,7 +55,7 @@ class CNTKModelSuite extends LinuxOnly with CNTKTestUtils with TransformerFuzzin .setOutputNodeIndex(0) } - lazy val images = testImages(spark) + override lazy val images = testImages(spark) import spark.implicits._ diff --git a/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/ImageFeaturizerSuite.scala similarity index 80% rename from src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala rename to 
deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/ImageFeaturizerSuite.scala index 247c7a421e1..1f9ca641c5a 100644 --- a/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/ImageFeaturizerSuite.scala @@ -1,31 +1,28 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. -package com.microsoft.ml.spark.image +package com.microsoft.ml.spark.cntk import java.io.File -import java.net.{URI, URL} +import java.net.URI import com.microsoft.ml.spark.Secrets import com.microsoft.ml.spark.build.BuildInfo -import com.microsoft.ml.spark.cntk.CNTKTestUtils import com.microsoft.ml.spark.core.env.FileUtilities -import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.core.utils.ModelEquality import com.microsoft.ml.spark.downloader.{ModelDownloader, ModelSchema} +import com.microsoft.ml.spark.image.ImageTestUtils import com.microsoft.ml.spark.io.IOImplicits._ import com.microsoft.ml.spark.io.powerbi.PowerBIWriter import com.microsoft.ml.spark.io.split1.FileReaderUtils -import org.apache.commons.io.FileUtils import org.apache.spark.injections.UDFUtils import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.StringType -trait NetworkUtils extends CNTKTestUtils with FileReaderUtils { +trait TrainedCNTKModelUtils extends ImageTestUtils with FileReaderUtils { lazy val modelDir = new File(filesRoot, "CNTKModel") lazy val modelDownloader = new ModelDownloader(spark, modelDir.toURI) @@ -33,33 +30,6 @@ trait NetworkUtils extends CNTKTestUtils with FileReaderUtils { lazy val resNetUri: URI = new File(modelDir, 
"ResNet50_ImageNet.model").toURI lazy val resNet: ModelSchema = modelDownloader.downloadByName("ResNet50") - lazy val images: DataFrame = spark.read.image.load(imagePath) - .withColumnRenamed("image", inputCol) - lazy val binaryImages: DataFrame = spark.read.binary.load(imagePath) - .select(col("value.bytes").alias(inputCol)) - - lazy val groceriesPath = FileUtilities.join(BuildInfo.datasetDir, "Images","Grocery") - lazy val groceryImages: DataFrame = spark.read.image - .option("dropInvalid", true) - .load(groceriesPath + "**") - .withColumnRenamed("image", inputCol) - - lazy val greyscaleImageLocation: String = { - val loc = "/tmp/greyscale.jpg" - val f = new File(loc) - if (f.exists()) {f.delete()} - FileUtils.copyURLToFile(new URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), f) - loc - } - - lazy val greyscaleImage: DataFrame = spark - .read.image.load(greyscaleImageLocation) - .select(col("image").alias(inputCol)) - - lazy val greyscaleBinary: DataFrame = spark - .read.binary.load(greyscaleImageLocation) - .select(col("value.bytes").alias(inputCol)) - def resNetModel(): ImageFeaturizer = new ImageFeaturizer() .setInputCol(inputCol) .setOutputCol(outputCol) @@ -68,7 +38,7 @@ trait NetworkUtils extends CNTKTestUtils with FileReaderUtils { } class ImageFeaturizerSuite extends TransformerFuzzing[ImageFeaturizer] - with NetworkUtils { + with TrainedCNTKModelUtils { test("Image featurizer should reproduce the CIFAR10 experiment") { print(spark) diff --git a/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala index ee6d53933a0..f67e4b82d5c 100644 --- a/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala +++ 
b/deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala @@ -7,6 +7,7 @@ import java.io.File import java.nio.file.Files import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.commons.io.FileUtils import scala.collection.JavaConverters._ diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala similarity index 86% rename from src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala index ae7103f2bd1..94e7d9aeb57 100644 --- a/src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala @@ -3,16 +3,16 @@ package com.microsoft.ml.spark.explainers +import java.io.File +import java.net.URL + +import com.microsoft.ml.spark.cntk.{ImageFeaturizer, TrainedCNTKModelUtils} import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.image.{ImageFeaturizer, NetworkUtils} import com.microsoft.ml.spark.io.IOImplicits._ import org.apache.commons.io.FileUtils import org.apache.spark.sql.DataFrame -import java.io.File -import java.net.URL - -abstract class ImageExplainersSuite extends TestBase with NetworkUtils { +abstract class ImageExplainersSuite extends TestBase with TrainedCNTKModelUtils { lazy val greyhoundImageLocation: String = { val loc = "/tmp/greyhound.jpg" val f = new File(loc) diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageLIMEExplainerSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageLIMEExplainerSuite.scala similarity index 98% rename from src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageLIMEExplainerSuite.scala 
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageLIMEExplainerSuite.scala index 41bc9b21ab2..131b69f6fdb 100644 --- a/src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageLIMEExplainerSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageLIMEExplainerSuite.scala @@ -1,13 +1,13 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. -package com.microsoft.ml.spark.explainers.split3 +package com.microsoft.ml.spark.explainers.split2 import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import com.microsoft.ml.spark.explainers.BreezeUtils._ import com.microsoft.ml.spark.explainers.{ImageExplainersSuite, ImageFormat, ImageLIME, LocalExplainer} -import com.microsoft.ml.spark.lime.SuperpixelData import com.microsoft.ml.spark.io.IOImplicits._ +import com.microsoft.ml.spark.lime.SuperpixelData import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.functions.col diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageSHAPExplainerSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageSHAPExplainerSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageSHAPExplainerSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageSHAPExplainerSuite.scala index 59fba17bb7a..1de490a4a8e 100644 --- a/src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageSHAPExplainerSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageSHAPExplainerSuite.scala @@ -1,11 +1,11 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. 
-package com.microsoft.ml.spark.explainers.split2 +package com.microsoft.ml.spark.explainers.split3 import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.explainers.{ImageExplainersSuite, ImageFormat, ImageSHAP, LocalExplainer} import com.microsoft.ml.spark.explainers.BreezeUtils._ +import com.microsoft.ml.spark.explainers.{ImageExplainersSuite, ImageFormat, ImageSHAP, LocalExplainer} import com.microsoft.ml.spark.lime.SuperpixelData import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util.MLReadable diff --git a/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala similarity index 65% rename from src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala index e83f910e377..b53d206137c 100644 --- a/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala @@ -7,82 +7,23 @@ import java.awt.image.BufferedImage import java.io.File import java.net.URL -import breeze.linalg.{*, DenseMatrix} -import breeze.stats.distributions.Rand -import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.image.{ImageFeaturizer, NetworkUtils} +import com.microsoft.ml.spark.cntk.{ImageFeaturizer, TrainedCNTKModelUtils} +import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import com.microsoft.ml.spark.io.IOImplicits._ import com.microsoft.ml.spark.io.image.ImageUtils import com.microsoft.ml.spark.io.split1.FileReaderUtils import com.microsoft.ml.spark.stages.UDFTransformer import com.microsoft.ml.spark.stages.udfs.get_value_udf import org.apache.commons.io.FileUtils -import org.apache.spark.injections.UDFUtils 
import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.ml.param.DataFrameEquality -import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.util.MLReadable import org.apache.spark.ml.{NamespaceInjections, PipelineModel} import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.DoubleType import org.apache.spark.sql.{DataFrame, Row} -trait LimeTestBase extends TestBase { - - import spark.implicits._ - - lazy val nRows = 100 - lazy val d1 = 3 - lazy val d2 = 1 - - lazy val m: DenseMatrix[Double] = new DenseMatrix(d1, d2, Array(1.0, -1.0, 2.0)) - lazy val x: DenseMatrix[Double] = DenseMatrix.rand(nRows, d1, Rand.gaussian) - lazy val noise: DenseMatrix[Double] = DenseMatrix.rand(nRows, d2, Rand.gaussian) * 0.1 - lazy val y = x * m //+ noise - - lazy val xRows = x(*, ::).iterator.toSeq.map(dv => new DenseVector(dv.toArray)) - lazy val yRows = y(*, ::).iterator.toSeq.map(dv => dv(0)) - lazy val df = xRows.zip(yRows).toDF("features", "label") - - lazy val model = new LinearRegression().fit(df) - - lazy val lime = new TabularLIME() - .setModel(model) - .setInputCol("features") - .setPredictionCol(model.getPredictionCol) - .setOutputCol("out") - .setNSamples(1000) - - lazy val limeModel = lime.fit(df) -} - -class TabularLIMESuite extends EstimatorFuzzing[TabularLIME] with - DataFrameEquality with LimeTestBase { - - test("text lime usage test check") { - val results = limeModel.transform(df).select("out") - .collect().map(_.getAs[DenseVector](0)) - results.foreach(result => assert(result === new DenseVector(m.data))) - } - - override def testObjects(): Seq[TestObject[TabularLIME]] = Seq(new TestObject(lime, df)) - - override def reader: MLReadable[_] = TabularLIME - - override def modelReader: MLReadable[_] = TabularLIMEModel -} - -class TabularLIMEModelSuite extends TransformerFuzzing[TabularLIMEModel] with - DataFrameEquality with LimeTestBase { - - override def testObjects(): Seq[TestObject[TabularLIMEModel]] = 
Seq(new TestObject(limeModel, df)) - - override def reader: MLReadable[_] = TabularLIMEModel -} - class ImageLIMESuite extends TransformerFuzzing[ImageLIME] with - DataFrameEquality with NetworkUtils with FileReaderUtils { + DataFrameEquality with TrainedCNTKModelUtils with FileReaderUtils { lazy val greyhoundImageLocation: String = { val loc = "/tmp/greyhound.jpg" diff --git a/docs/cogsvc.md b/docs/cogsvc.md index edec95f3751..949ae14c96a 100644 --- a/docs/cogsvc.md +++ b/docs/cogsvc.md @@ -9,7 +9,7 @@ Azure Cognitive Services on Spark enable working with Azure’s Intelligent Services at massive scales with the Apache Spark™ distributed computing ecosystem. Cognitive Services on Spark allows users to embed general purpose and continuously improving intelligent models directly into their Apache Spark™ and SQL computations. This liberates developers from low-level networking details, so they can focus on creating intelligent, distributed applications. Each Cognitive Service acts as a SparkML transformer, so users can add services to existing SparkML pipelines. This is a great example of our [HTTP-on-Spark](http.md) capability that lets you interact with HTTP services from Apache Spark. ## Usage -To see an example of Cognitive Services on Spark in action, take a look at [this sample notebook](../notebooks/samples/CognitiveServices%20-%20Celebrity%20Quote%20Analysis.ipynb). +To see an example of Cognitive Services on Spark in action, take a look at [this sample notebook](../notebooks/CognitiveServices%20-%20Celebrity%20Quote%20Analysis.ipynb). ## Cognitive Services on Apache Spark™ Currently, the following Cognitive Services are available on Apache Spark™ through MMLSpark: diff --git a/docs/datasets.md b/docs/datasets.md index 8376027f4f4..595ae3d4098 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -24,7 +24,7 @@ tab-separated file with 2 columns (`rating`, `text`) and 10000 rows. The contains free-form text strings in English language. 
You can use `mmlspark.TextFeaturizer` to convert the text into feature vectors for machine learning models ([see -example](../notebooks/samples/201%20-%20Amazon%20Book%20Reviews%20-%20TextFeaturizer.ipynb)). +example](../notebooks/201%20-%20Amazon%20Book%20Reviews%20-%20TextFeaturizer.ipynb)). The example dataset is available [here](https://mmlspark.azureedge.net/datasets/BookReviewsFromAmazon10K.tsv); @@ -48,7 +48,7 @@ The example dataset is available the original dataset is available [Krizhevsky's page](https://www.cs.toronto.edu/~kriz/cifar.html). The dataset has been packaged into a gzipped tar archive. See notebook [301 - CIFAR10 CNTK CNN -Evaluation](../notebooks/samples/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation.ipynb) +Evaluation](../notebooks/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation.ipynb) for an example how to extract the image data. Reference: [_Learning Multiple Layers of Features from Tiny diff --git a/docs/lightgbm.md b/docs/lightgbm.md index fed5bc34131..87d5c366f2e 100644 --- a/docs/lightgbm.md +++ b/docs/lightgbm.md @@ -49,7 +49,7 @@ model = LightGBMRegressor(application='quantile', ``` For an end to end application, check out the LightGBM [notebook -example](../notebooks/samples/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb). +example](../notebooks/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb). ### Architecture diff --git a/docs/mmlspark-serving.md b/docs/mmlspark-serving.md index d59e3e0c58a..9471644805f 100644 --- a/docs/mmlspark-serving.md +++ b/docs/mmlspark-serving.md @@ -25,7 +25,7 @@ ### Jupyter Notebook Examples -- [Deploy a classifier trained on the Adult Census Dataset](../notebooks/samples/SparkServing%20-%20Deploying%20a%20Classifier.ipynb) +- [Deploy a classifier trained on the Adult Census Dataset](../notebooks/SparkServing%20-%20Deploying%20a%20Classifier.ipynb) - More coming soon! 
### Spark Serving Hello World diff --git a/docs/vw.md b/docs/vw.md index 6deaeedf089..ddb0b7f6920 100644 --- a/docs/vw.md +++ b/docs/vw.md @@ -58,7 +58,7 @@ model = (VowpalWabbitRegressor(args="--holdout_off --loss_function quantile -q : Through the args parameter you can pass command line parameters to VW as documented in the [VW Wiki](https://github.com/vowpalWabbit/vowpal_wabbit/wiki/Command-Line-Arguments). For an end to end application, check out the VowpalWabbit [notebook -example](../notebooks/samples/Vowpal%20Wabbit%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb]). +example](../notebooks/Vowpal%20Wabbit%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb]). ### Hyper-parameter tuning diff --git a/environment.yaml b/environment.yaml index 1c1994e7858..338862d1001 100644 --- a/environment.yaml +++ b/environment.yaml @@ -6,6 +6,7 @@ dependencies: - python=3.8.8 - pyspark=3.1.2 - requests + - pip - r-base - r-dplyr - r-sparklyr diff --git a/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py diff --git a/src/test/python/mmlsparktest/cognitive/__init__.py 
b/lightgbm/src/main/python/mmlspark/lightgbm/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cognitive/__init__.py rename to lightgbm/src/main/python/mmlspark/lightgbm/__init__.py diff --git a/src/main/python/mmlspark/lightgbm/mixin.py b/lightgbm/src/main/python/mmlspark/lightgbm/mixin.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/mixin.py rename to lightgbm/src/main/python/mmlspark/lightgbm/mixin.py diff --git a/src/main/scala/com/microsoft/lightgbm/SWIG.scala b/lightgbm/src/main/scala/com/microsoft/lightgbm/SWIG.scala similarity index 100% rename from src/main/scala/com/microsoft/lightgbm/SWIG.scala rename to lightgbm/src/main/scala/com/microsoft/lightgbm/SWIG.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/PartitionProcessor.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/PartitionProcessor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/PartitionProcessor.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/PartitionProcessor.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala index eddce5cc295..1c420b1d063 100644 --- a/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala +++ b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala @@ -8,7 +8,7 @@ import java.net._ import com.microsoft.ml.lightgbm._ import com.microsoft.ml.spark.core.env.StreamUtilities._ -import com.microsoft.ml.spark.downloader.FaultToleranceUtils +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import com.microsoft.ml.spark.lightgbm.booster.LightGBMBooster import com.microsoft.ml.spark.lightgbm.dataset.{DatasetUtils, LightGBMDataset} import 
com.microsoft.ml.spark.lightgbm.params.{ClassifierTrainParams, TrainParams} diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala index a404a42e37f..02ba5b698e1 100644 --- a/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala +++ b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala @@ -134,8 +134,9 @@ object DatasetUtils { /** * Sample the first several rows to determine whether to construct sparse or dense matrix in lightgbm native code. - * @param rowsIter Iterator of rows. - * @param schema The schema. + * + * @param rowsIter Iterator of rows. + * @param schema The schema. * @param columnParams The column parameters. * @return A reconstructed iterator with the same original rows and whether the matrix should be sparse or dense. 
*/ @@ -158,7 +159,7 @@ object DatasetUtils { } def addFeaturesToChunkedArray(featuresChunkedArrayOpt: Option[doubleChunkedArray], numCols: Int, - rowAsDoubleArray: Array[Double]): Unit = { + rowAsDoubleArray: Array[Double]): Unit = { featuresChunkedArrayOpt.foreach { featuresChunkedArray => rowAsDoubleArray.foreach { doubleVal => featuresChunkedArray.add(doubleVal) diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala 
b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala diff --git a/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv b/lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv rename to lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv diff --git a/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv b/lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv rename to lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala similarity index 100% rename from 
src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala diff --git a/notebooks/samples/AzureSearchIndex - Met Artworks.ipynb b/notebooks/AzureSearchIndex - Met Artworks.ipynb similarity index 100% rename from notebooks/samples/AzureSearchIndex - Met Artworks.ipynb rename to notebooks/AzureSearchIndex - Met Artworks.ipynb diff --git a/notebooks/samples/Classification - Adult Census with Vowpal Wabbit.ipynb b/notebooks/Classification - Adult Census with Vowpal Wabbit.ipynb similarity index 98% rename from notebooks/samples/Classification - Adult Census with Vowpal Wabbit.ipynb rename to notebooks/Classification - Adult Census with Vowpal Wabbit.ipynb index e7098605ccc..4608bce764e 100644 --- a/notebooks/samples/Classification - Adult Census with Vowpal Wabbit.ipynb +++ b/notebooks/Classification - Adult Census with Vowpal Wabbit.ipynb @@ -8,7 +8,7 @@ "# Classification - Adult Census using Vowpal Wabbit in MMLSpark\n", "\n", "In this example, we predict incomes from the *Adult Census* dataset using Vowpal Wabbit (VW) classifier in 
MMLSpark.\n", - "First, we read the data and split it into train and test sets as in this [example](https://github.com/Azure/mmlspark/blob/master/notebooks/samples/Classification%20-%20Adult%20Census.ipynb\n", + "First, we read the data and split it into train and test sets as in this [example](https://github.com/Azure/mmlspark/blob/master/notebooks/Classification%20-%20Adult%20Census.ipynb\n", ")." ] }, diff --git a/notebooks/samples/Classification - Adult Census.ipynb b/notebooks/Classification - Adult Census.ipynb similarity index 100% rename from notebooks/samples/Classification - Adult Census.ipynb rename to notebooks/Classification - Adult Census.ipynb diff --git a/notebooks/samples/Classification - Before and After MMLSpark.ipynb b/notebooks/Classification - Before and After MMLSpark.ipynb similarity index 100% rename from notebooks/samples/Classification - Before and After MMLSpark.ipynb rename to notebooks/Classification - Before and After MMLSpark.ipynb diff --git a/notebooks/samples/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb b/notebooks/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb similarity index 100% rename from notebooks/samples/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb rename to notebooks/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb diff --git a/notebooks/samples/Cognitive Services - Overview.ipynb b/notebooks/Cognitive Services - Overview.ipynb similarity index 100% rename from notebooks/samples/Cognitive Services - Overview.ipynb rename to notebooks/Cognitive Services - Overview.ipynb diff --git a/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb b/notebooks/CognitiveServices - Celebrity Quote Analysis.ipynb similarity index 100% rename from notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb rename to notebooks/CognitiveServices - Celebrity Quote Analysis.ipynb diff --git a/notebooks/samples/ConditionalKNN - Exploring Art Across Cultures.ipynb 
b/notebooks/ConditionalKNN - Exploring Art Across Cultures.ipynb similarity index 100% rename from notebooks/samples/ConditionalKNN - Exploring Art Across Cultures.ipynb rename to notebooks/ConditionalKNN - Exploring Art Across Cultures.ipynb diff --git a/notebooks/samples/CyberML - Anomalous Access Detection.ipynb b/notebooks/CyberML - Anomalous Access Detection.ipynb similarity index 100% rename from notebooks/samples/CyberML - Anomalous Access Detection.ipynb rename to notebooks/CyberML - Anomalous Access Detection.ipynb diff --git a/notebooks/samples/DeepLearning - BiLSTM Medical Entity Extraction.ipynb b/notebooks/DeepLearning - BiLSTM Medical Entity Extraction.ipynb similarity index 100% rename from notebooks/samples/DeepLearning - BiLSTM Medical Entity Extraction.ipynb rename to notebooks/DeepLearning - BiLSTM Medical Entity Extraction.ipynb diff --git a/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb b/notebooks/DeepLearning - CIFAR10 Convolutional Network.ipynb similarity index 100% rename from notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb rename to notebooks/DeepLearning - CIFAR10 Convolutional Network.ipynb diff --git a/notebooks/samples/DeepLearning - Flower Image Classification.ipynb b/notebooks/DeepLearning - Flower Image Classification.ipynb similarity index 98% rename from notebooks/samples/DeepLearning - Flower Image Classification.ipynb rename to notebooks/DeepLearning - Flower Image Classification.ipynb index 0b6100ae81f..165bd30ce17 100644 --- a/notebooks/samples/DeepLearning - Flower Image Classification.ipynb +++ b/notebooks/DeepLearning - Flower Image Classification.ipynb @@ -51,7 +51,8 @@ "outputs": [], "source": [ "from mmlspark.opencv import ImageTransformer\n", - "from mmlspark.image import UnrollImage, ImageFeaturizer\n", + "from mmlspark.image import UnrollImage\n", + "from mmlspark.cntk import ImageFeaturizer\n", "from mmlspark.stages import *\n", "\n", "# Make some featurizers\n", @@ -220,4 
+221,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/notebooks/samples/DeepLearning - Transfer Learning.ipynb b/notebooks/DeepLearning - Transfer Learning.ipynb similarity index 100% rename from notebooks/samples/DeepLearning - Transfer Learning.ipynb rename to notebooks/DeepLearning - Transfer Learning.ipynb diff --git a/notebooks/samples/HttpOnSpark - Working with Arbitrary Web APIs.ipynb b/notebooks/HttpOnSpark - Working with Arbitrary Web APIs.ipynb similarity index 100% rename from notebooks/samples/HttpOnSpark - Working with Arbitrary Web APIs.ipynb rename to notebooks/HttpOnSpark - Working with Arbitrary Web APIs.ipynb diff --git a/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb b/notebooks/HyperParameterTuning - Fighting Breast Cancer.ipynb similarity index 100% rename from notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb rename to notebooks/HyperParameterTuning - Fighting Breast Cancer.ipynb diff --git a/notebooks/samples/LightGBM - Overview.ipynb b/notebooks/LightGBM - Overview.ipynb similarity index 100% rename from notebooks/samples/LightGBM - Overview.ipynb rename to notebooks/LightGBM - Overview.ipynb diff --git a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb b/notebooks/ModelInterpretation - Snow Leopard Detection.ipynb similarity index 99% rename from notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb rename to notebooks/ModelInterpretation - Snow Leopard Detection.ipynb index 097cb3dee18..4be5c881bc3 100644 --- a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb +++ b/notebooks/ModelInterpretation - Snow Leopard Detection.ipynb @@ -198,7 +198,7 @@ "from pyspark.ml.classification import LogisticRegression\n", "from pyspark.sql.functions import udf\n", "from mmlspark.downloader import ModelDownloader\n", - "from mmlspark.image import ImageFeaturizer \n", + "from mmlspark.cntk import ImageFeaturizer\n", "from 
mmlspark.stages import UDFTransformer\n", "from pyspark.sql.types import *\n", "\n", diff --git a/notebooks/samples/OpenCV - Pipeline Image Transformations.ipynb b/notebooks/OpenCV - Pipeline Image Transformations.ipynb similarity index 100% rename from notebooks/samples/OpenCV - Pipeline Image Transformations.ipynb rename to notebooks/OpenCV - Pipeline Image Transformations.ipynb diff --git a/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb b/notebooks/Regression - Flight Delays with DataCleaning.ipynb similarity index 100% rename from notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb rename to notebooks/Regression - Flight Delays with DataCleaning.ipynb diff --git a/notebooks/samples/Regression - Auto Imports.ipynb b/notebooks/Regression - Auto Imports.ipynb similarity index 100% rename from notebooks/samples/Regression - Auto Imports.ipynb rename to notebooks/Regression - Auto Imports.ipynb diff --git a/notebooks/samples/Regression - Flight Delays.ipynb b/notebooks/Regression - Flight Delays.ipynb similarity index 100% rename from notebooks/samples/Regression - Flight Delays.ipynb rename to notebooks/Regression - Flight Delays.ipynb diff --git a/notebooks/samples/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb b/notebooks/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb similarity index 100% rename from notebooks/samples/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb rename to notebooks/Regression - Vowpal Wabbit vs. LightGBM vs. 
Linear Regressor.ipynb diff --git a/notebooks/samples/SparkServing - Deploying a Classifier.ipynb b/notebooks/SparkServing - Deploying a Classifier.ipynb similarity index 100% rename from notebooks/samples/SparkServing - Deploying a Classifier.ipynb rename to notebooks/SparkServing - Deploying a Classifier.ipynb diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb b/notebooks/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb similarity index 100% rename from notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb rename to notebooks/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb b/notebooks/TextAnalytics - Amazon Book Reviews.ipynb similarity index 100% rename from notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb rename to notebooks/TextAnalytics - Amazon Book Reviews.ipynb diff --git a/notebooks/samples/Vowpal Wabbit - Overview.ipynb b/notebooks/Vowpal Wabbit - Overview.ipynb similarity index 100% rename from notebooks/samples/Vowpal Wabbit - Overview.ipynb rename to notebooks/Vowpal Wabbit - Overview.ipynb diff --git a/src/main/python/mmlspark/opencv/ImageTransformer.py b/opencv/src/main/python/mmlspark/opencv/ImageTransformer.py similarity index 100% rename from src/main/python/mmlspark/opencv/ImageTransformer.py rename to opencv/src/main/python/mmlspark/opencv/ImageTransformer.py diff --git a/src/test/python/mmlsparktest/cyber/__init__.py b/opencv/src/main/python/mmlspark/opencv/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/__init__.py rename to opencv/src/main/python/mmlspark/opencv/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenter.scala similarity index 96% rename from src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala rename to 
opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenter.scala index d957e949630..ae89e80dd91 100644 --- a/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala +++ b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenter.scala @@ -1,12 +1,11 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. -package com.microsoft.ml.spark.image +package com.microsoft.ml.spark.opencv import com.microsoft.ml.spark.codegen.Wrappable import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol} import com.microsoft.ml.spark.logging.BasicLogging -import com.microsoft.ml.spark.opencv.{Flip, ImageTransformer} import org.apache.spark.ml._ import org.apache.spark.ml.image.ImageSchema import org.apache.spark.ml.param._ diff --git a/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala diff --git a/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala rename to opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala index 5d05a243ccf..b20b309bb05 100644 --- 
a/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala +++ b/opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala @@ -8,15 +8,15 @@ import java.net.URL import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.opencv.{ImageTestUtils, ImageTransformer} +import com.microsoft.ml.spark.io.IOImplicits._ +import com.microsoft.ml.spark.opencv.{ImageTransformer, OpenCVTestUtils} +import org.apache.commons.io.FileUtils import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.{DataFrame, Row} -import com.microsoft.ml.spark.io.IOImplicits._ -import org.apache.commons.io.FileUtils class ResizeImageTransformerSuite extends TransformerFuzzing[ResizeImageTransformer] - with ImageTestUtils { + with OpenCVTestUtils { lazy val images: DataFrame = spark.read.image .option("dropInvalid", true).load(FileUtilities.join(fileLocation, "**").toString) diff --git a/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenterSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala rename to opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenterSuite.scala index 51993e8e955..427f84d08fb 100644 --- a/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala +++ b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenterSuite.scala @@ -1,7 +1,7 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. 
-package com.microsoft.ml.spark.image +package com.microsoft.ml.spark.opencv import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.core.test.base.LinuxOnly diff --git a/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala similarity index 98% rename from src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala rename to opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala index 6c7ab6dfe53..62a43aa5e93 100644 --- a/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala +++ b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala @@ -23,7 +23,7 @@ import org.opencv.imgproc.Imgproc import org.scalactic.Equality import org.scalatest.Assertion -trait ImageTestUtils { +trait OpenCVTestUtils { lazy protected val fileLocation = FileUtilities.join(BuildInfo.datasetDir, "Images", "Grocery") protected def selectTestImageBytes(images: DataFrame): Array[Byte] = { @@ -81,7 +81,7 @@ trait ImageTestUtils { } -class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with ImageTestUtils with DataFrameEquality { +class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with OpenCVTestUtils with DataFrameEquality { lazy val filesRoot = BuildInfo.datasetDir lazy val imagePath = FileUtilities.join(filesRoot,"Images", "CIFAR").toString @@ -128,7 +128,7 @@ class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with ImageTestUti } class UnrollBinaryImageSuite extends TransformerFuzzing[UnrollBinaryImage] - with ImageTestUtils with DataFrameEquality { + with OpenCVTestUtils with DataFrameEquality { lazy val filesRoot = BuildInfo.datasetDir lazy val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString @@ -163,7 +163,7 @@ class UnrollBinaryImageSuite extends TransformerFuzzing[UnrollBinaryImage] override def reader: UnrollBinaryImage.type = 
UnrollBinaryImage } -class ImageTransformerSuite extends TransformerFuzzing[ImageTransformer] with ImageTestUtils { +class ImageTransformerSuite extends TransformerFuzzing[ImageTransformer] with OpenCVTestUtils { //TODO this is needed to stop the build from freezing override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Unit = { diff --git a/pipeline.yaml b/pipeline.yaml index 7a4eaf66ddf..eb25b5c40c4 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -33,7 +33,6 @@ jobs: pool: vmImage: ubuntu-18.04 steps: - - template: templates/ivy_cache.yml - task: AzureCLI@1 displayName: 'Style Check' inputs: @@ -46,7 +45,7 @@ jobs: pool: vmImage: ubuntu-18.04 steps: - - template: templates/ivy_cache_2.yml + #- template: templates/ivy_cache.yml - bash: echo "##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH - bash: conda info @@ -87,7 +86,7 @@ jobs: pool: vmImage: ubuntu-18.04 steps: - - template: templates/ivy_cache_2.yml + #- template: templates/ivy_cache.yml - bash: echo "##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH - bash: conda info @@ -128,14 +127,13 @@ jobs: pool: vmImage: ubuntu-18.04 steps: - - template: templates/ivy_cache.yml - task: AzureCLI@1 displayName: 'Get Docker Tag + Version' inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript inlineScript: | - VERSION=$(sbt version | tail -1 | cut -d' ' -f2 | sed 's/\x1b\[[0-9;]*m//g') + VERSION=$(sbt "core/version" | tail -1 | cut -d' ' -f2 | sed 's/\x1b\[[0-9;]*m//g') echo '##vso[task.setvariable variable=version]'$VERSION echo '##vso[task.setvariable variable=gittag]'$(git tag -l --points-at HEAD) - task: Docker@2 @@ -226,7 +224,7 @@ jobs: pool: vmImage: ubuntu-18.04 steps: - - template: templates/ivy_cache_2.yml + #- template: templates/ivy_cache.yml - bash: echo "##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH - bash: conda env create -f environment.yaml @@ -275,7 +273,7 @@ jobs: pool: vmImage: ubuntu-18.04 
steps: - - template: templates/ivy_cache_2.yml + #- template: templates/ivy_cache_2.yml - bash: echo "##vso[task.prependpath]$CONDA/bin" displayName: Add conda to PATH - bash: conda env create -f environment.yaml @@ -389,7 +387,7 @@ jobs: vw: PACKAGE: "vw" steps: - - template: templates/ivy_cache_2.yml + #- template: templates/ivy_cache.yml - task: AzureCLI@1 displayName: 'Setup repo' inputs: diff --git a/project/BlobMavenPlugin.scala b/project/BlobMavenPlugin.scala new file mode 100644 index 00000000000..de8114172e0 --- /dev/null +++ b/project/BlobMavenPlugin.scala @@ -0,0 +1,48 @@ +import java.io.File + +import BlobMavenPlugin.autoImport.publishBlob +import BuildUtils.{join, uploadToBlob} +import sbt._ +import Keys._ +import org.apache.ivy.core.IvyPatternHelper + +//noinspection ScalaStyle +object BlobMavenPlugin extends AutoPlugin { + override def trigger = allRequirements + + object autoImport { + val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob") + val blobArtifactInfo = SettingKey[String]("blobArtifactInfo") + } + + import autoImport._ + + override def requires: Plugins = sbt.Plugins.empty + + override lazy val projectSettings: Seq[Setting[_]] = Seq( + publishBlob := { + publishM2.value + //TODO make this more general - 1.0 is a hack and not sure of a way to get this with sbt keys + val sourceArtifactName = s"${moduleName.value}_${scalaBinaryVersion.value}_1.0" + val destArtifactName = s"${moduleName.value}" + val repositoryDir = new File(new URI(Resolver.mavenLocal.root)) + val orgDirs = organization.value.split(".".toCharArray.head) + val localPackageFolder = join(repositoryDir, orgDirs ++ Seq(sourceArtifactName, version.value):_*).toString + val blobMavenFolder = (orgDirs ++ Seq(destArtifactName, version.value)).mkString("/") + uploadToBlob(localPackageFolder, blobMavenFolder, "maven") + println(blobArtifactInfo.value) + }, + blobArtifactInfo := { + s""" + |MMLSpark Build and Release Information + |--------------- + | + 
|### Maven Coordinates + | `${organization.value}:${moduleName.value}:${version.value}` + | + |### Maven Resolver + | `https://mmlspark.azureedge.net/maven` + |""".stripMargin + } + ) +} \ No newline at end of file diff --git a/project/CodegenPlugin.scala b/project/CodegenPlugin.scala new file mode 100644 index 00000000000..0c660663f93 --- /dev/null +++ b/project/CodegenPlugin.scala @@ -0,0 +1,245 @@ +import java.io.File + +import BuildUtils.{join, runCmd, singleUploadToBlob, zipFolder} +import CondaPlugin.autoImport.{activateCondaEnv, condaEnvLocation, createCondaEnvTask} +import org.apache.commons.io.FileUtils +import sbt.Keys._ +import sbt.{Def, _} +import spray.json._ + +object CodegenConfigProtocol extends DefaultJsonProtocol { + implicit val CCFormat: RootJsonFormat[CodegenConfig] = jsonFormat8(CodegenConfig.apply) +} + +import CodegenConfigProtocol._ + +case class CodegenConfig(name: String, + jarName: Option[String], + topDir: String, + targetDir: String, + version: String, + pythonizedVersion: String, + rVersion: String, + packageName: String) + +//noinspection ScalaStyle +object CodegenPlugin extends AutoPlugin { + override def trigger = allRequirements + + override def requires: Plugins = CondaPlugin + + def rCmd(activateCondaEnv: Seq[String], cmd: Seq[String], wd: File, libPath: String): Unit = { + runCmd(activateCondaEnv ++ cmd, wd, Map("R_LIBS" -> libPath, "R_USER_LIBS" -> libPath)) + } + + val RInstall = Tags.Tag("rInstall") + + object autoImport { + val pythonizedVersion = settingKey[String]("Pythonized version") + val rVersion = settingKey[String]("R version") + val genPackageNamespace = settingKey[String]("genPackageNamespace") + val genTestPackageNamespace = settingKey[String]("genTestPackageNamespace") + val codegenJarName = settingKey[Option[String]]("codegenJarName") + val testgenJarName = settingKey[Option[String]]("testgenJarName") + val codegenArgs = settingKey[String]("codegenArgs") + val testgenArgs = settingKey[String]("testgenArgs") + + 
+ val targetDir = settingKey[File]("targetDir") + val codegenDir = settingKey[File]("codegenDir") + + val codegen = TaskKey[Unit]("codegen", "Generate Code") + val testgen = TaskKey[Unit]("testgen", "Generate Tests") + + val packageR = TaskKey[Unit]("packageR", "Generate roxygen docs and zip R package") + val publishR = TaskKey[Unit]("publishR", "publish R package to blob") + val testR = TaskKey[Unit]("testR", "Run testthat on R tests") + + val packagePython = TaskKey[Unit]("packagePython", "Package python sdk") + val installPipPackage = TaskKey[Unit]("installPipPackage", "install python sdk") + val publishPython = TaskKey[Unit]("publishPython", "publish python wheel") + val testPython = TaskKey[Unit]("testPython", "test python sdk") + + val mergePyCodeDir = SettingKey[File]("mergePyCodeDir") + val mergePyCode = TaskKey[Unit]("mergePyCode", "copy python code to a destination") + } + + import autoImport._ + + override lazy val globalSettings: Seq[Setting[_]] = Seq( + Global / concurrentRestrictions += Tags.limit(RInstall, 1) + ) + + def testRImpl: Def.Initialize[Task[Unit]] = Def.task { + packageR.value + publishLocal.value + val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString + val rSrcDir = join(codegenDir.value, "src", "R", genPackageNamespace.value) + rCmd(activateCondaEnv.value, + Seq("R", "CMD", "INSTALL", "--no-multiarch", "--with-keep.source", genPackageNamespace.value), + rSrcDir.getParentFile, libPath) + val testRunner = join("tools", "tests", "run_r_tests.R") + if (join(rSrcDir,"tests").exists()){ + rCmd(activateCondaEnv.value, + Seq("Rscript", testRunner.getAbsolutePath), rSrcDir, libPath) + } + } tag(RInstall) + + + override lazy val projectSettings: Seq[Setting[_]] = Seq( + publishMavenStyle := true, + codegenArgs := { + CodegenConfig( + name.value, + codegenJarName.value, + baseDirectory.value.getAbsolutePath, + targetDir.value.getAbsolutePath, + version.value, + pythonizedVersion.value, + rVersion.value, + 
genPackageNamespace.value + ).toJson.compactPrint + }, + testgenArgs := { + CodegenConfig( + name.value, + testgenJarName.value, + baseDirectory.value.getAbsolutePath, + targetDir.value.getAbsolutePath, + version.value, + pythonizedVersion.value, + rVersion.value, + genPackageNamespace.value + ).toJson.compactPrint + }, + codegenJarName := { + val art: Artifact = (Compile / packageBin / artifact).value + Some(artifactName.value( + ScalaVersion(scalaVersion.value, scalaBinaryVersion.value), + projectID.value, + art)) + }, + testgenJarName := { + val art: Artifact = (Test / packageBin / artifact).value + Some(artifactName.value( + ScalaVersion(scalaVersion.value, scalaBinaryVersion.value), + projectID.value, + art)) + }, + codegen := (Def.taskDyn { + (Compile / compile).value + (Test / compile).value + val arg = codegenArgs.value + Def.task { + (Compile / runMain).toTask(s" com.microsoft.ml.spark.codegen.CodeGen $arg").value + } + }.value), + testgen := (Def.taskDyn { + (Compile / compile).value + (Test / compile).value + val arg = testgenArgs.value + Def.task { + (Test / runMain).toTask(s" com.microsoft.ml.spark.codegen.TestGen $arg").value + } + }.value), + pythonizedVersion := { + if (version.value.contains("-")) { + version.value.split("-".head).head + ".dev1" + } else { + version.value + } + }, + rVersion := { + if (version.value.contains("-")) { + version.value.split("-".head).head + } else { + version.value + } + }, + packageR := { + createCondaEnvTask.value + codegen.value + val rSrcDir = join(codegenDir.value, "src", "R", genPackageNamespace.value) + val rPackageDir = join(codegenDir.value, "package", "R") + val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString + rCmd(activateCondaEnv.value, Seq("R", "-q", "-e", "roxygen2::roxygenise()"), rSrcDir, libPath) + rPackageDir.mkdirs() + zipFolder(rSrcDir, new File(rPackageDir, s"${name.value}-${version.value}.zip")) + }, + testR := testRImpl.value, + publishR := { + codegen.value + 
packageR.value + val rPackageDir = join(codegenDir.value, "package", "R") + val rPackage = rPackageDir.listFiles().head + singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr") + }, + packagePython := { + codegen.value + createCondaEnvTask.value + val destPyDir = join(targetDir.value, "classes", genPackageNamespace.value) + val packageDir = join(codegenDir.value, "package", "python").absolutePath + val pythonSrcDir = join(codegenDir.value, "src", "python") + if (destPyDir.exists()) FileUtils.forceDelete(destPyDir) + val sourcePyDir = join(pythonSrcDir.getAbsolutePath, genPackageNamespace.value) + FileUtils.copyDirectory(sourcePyDir, destPyDir) + runCmd( + activateCondaEnv.value ++ + Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", packageDir), + pythonSrcDir) + }, + installPipPackage := { + packagePython.value + publishLocal.value + runCmd( + activateCondaEnv.value ++ Seq("pip", "install", "-I", + s"${name.value.replace("-", "_")}-${pythonizedVersion.value}-py2.py3-none-any.whl"), + join(codegenDir.value, "package", "python")) + }, + publishPython := { + publishLocal.value + packagePython.value + val fn = s"${name.value.replace("-", "_")}-${pythonizedVersion.value}-py2.py3-none-any.whl" + singleUploadToBlob( + join(codegenDir.value, "package", "python", fn).toString, + version.value + "/" + fn, "pip") + }, + mergePyCode := { + val srcDir = join(codegenDir.value, "src", "python", genPackageNamespace.value) + val destDir = join(mergePyCodeDir.value, "src", "python", genPackageNamespace.value) + FileUtils.copyDirectory(srcDir, destDir) + }, + testPython := { + installPipPackage.value + testgen.value + val mainTargetDir = join(baseDirectory.value.getParent, "target") + runCmd( + activateCondaEnv.value ++ Seq("python", + "-m", + "pytest", + s"--cov=${genPackageNamespace.value}", + s"--junitxml=${join(mainTargetDir, s"python-test-results-${name.value}.xml")}", + "--cov-report=xml", + genTestPackageNamespace.value + ), + new File(codegenDir.value, 
"test/python/") + ) + }, + targetDir := { + artifactPath.in(packageBin).in(Compile).value.getParentFile + }, + mergePyCodeDir := { + join(baseDirectory.value.getParent, "target", "scala-2.12", "sbt-1.0", "generated") + }, + codegenDir := { + join(targetDir.value, "generated") + }, + genPackageNamespace := { + "mmlspark" + }, + genTestPackageNamespace := { + "mmlsparktest" + } + + ) +} \ No newline at end of file diff --git a/project/CondaPlugin.scala b/project/CondaPlugin.scala new file mode 100644 index 00000000000..4e3e3ce005b --- /dev/null +++ b/project/CondaPlugin.scala @@ -0,0 +1,56 @@ +import BuildUtils.{osPrefix, runCmd} +import sbt._ +import Keys._ + +import scala.sys.process.Process + +//noinspection ScalaStyle +object CondaPlugin extends AutoPlugin { + override def trigger = allRequirements + + object autoImport { + val condaEnvName = settingKey[String]("Name of conda environment") + val cleanCondaEnvTask = TaskKey[Unit]("cleanCondaEnv", "create conda env") + val condaEnvLocation = TaskKey[File]("condaEnvLocation", "get install location of conda env") + val createCondaEnvTask = TaskKey[Unit]("createCondaEnv", "create conda env") + val activateCondaEnv = settingKey[Seq[String]]("commands to activate conda environment") + } + + import autoImport._ + override lazy val globalSettings: Seq[Setting[_]] = Seq( + condaEnvName := "mmlspark", + cleanCondaEnvTask := { + runCmd(Seq("conda", "env", "remove", "--name", condaEnvName.value, "-y")) + }, + condaEnvLocation := { + createCondaEnvTask.value + new File(Process("conda env list").lineStream.toList + .map(_.split("\\s+")) + .map(l => (l.head, l.reverse.head)) + .filter(p => p._1 == condaEnvName.value) + .head._2) + }, + createCondaEnvTask := { + val hasEnv = Process("conda env list").lineStream.toList + .map(_.split("\\s+").head).contains(condaEnvName.value) + if (!hasEnv) { + runCmd(Seq("conda", "env", "create", "-f", "environment.yaml")) + } else { + println("Found conda env " + condaEnvName.value) + } + }, + 
activateCondaEnv := { + if (sys.props("os.name").toLowerCase.contains("windows")) { + osPrefix ++ Seq("activate", condaEnvName.value, "&&") + } else { + Seq() + //TODO figure out why this doesn't work + //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&") + } + } + ) + + override def requires: Plugins = sbt.Plugins.empty + + override lazy val projectSettings: Seq[Setting[_]] = Seq() +} \ No newline at end of file diff --git a/project/build.scala b/project/build.scala index f7816cd5d48..06a930e33d1 100644 --- a/project/build.scala +++ b/project/build.scala @@ -2,8 +2,12 @@ import java.io.File import java.lang.ProcessBuilder.Redirect object BuildUtils { + def join(root: File, folders: String*): File = { + folders.foldLeft(root) { case (f, s) => new File(f, s) } + } + def join(folders: String*): File = { - folders.tail.foldLeft(new File(folders.head)) { case (f, s) => new File(f, s) } + join(new File(folders.head), folders.tail: _*) } def isWindows: Boolean = { @@ -27,7 +31,7 @@ object BuildUtils { .redirectError(Redirect.INHERIT) .redirectOutput(Redirect.INHERIT) val env = pb.environment() - envVars.foreach(p =>env.put(p._1,p._2)) + envVars.foreach(p => env.put(p._1, p._2)) assert(pb.start().waitFor() == 0) } @@ -56,6 +60,7 @@ object BuildUtils { "--account-key", Secrets.storageKey) runCmd(osPrefix ++ command) } + def singleUploadToBlob(source: String, dest: String, container: String, @@ -76,6 +81,7 @@ object BuildUtils { val (dirs, files) = dir.listFiles.sorted.partition(_.isDirectory) (if (pred == null) files else files.filter(pred)) ++ dirs.flatMap(loop) } + loop(dir) } @@ -91,7 +97,9 @@ object BuildUtils { zip.putNextEntry(new ZipEntry(file.toString.substring(prefixLen).replace(java.io.File.separator, "/"))) val in = new BufferedInputStream(new FileInputStream(file), bufferSize) var b = 0 - while (b >= 0) { zip.write(data, 0, b); b = in.read(data, 0, bufferSize) } + while (b >= 0) { + zip.write(data, 0, b); b = in.read(data, 0, bufferSize) + } 
in.close() zip.closeEntry() } diff --git a/project/plugins.sbt b/project/plugins.sbt index cc082cf59b0..6f4bd427f23 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -4,4 +4,4 @@ addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0") addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.8") addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1") addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.0.0") -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0") \ No newline at end of file +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0") diff --git a/src/main/python/setup.py b/src/main/python/setup.py deleted file mode 100644 index 3ba8474be22..00000000000 --- a/src/main/python/setup.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (C) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE in project root for information. - -import os -from setuptools import setup, find_packages -import codecs -import os.path - - -def read(rel_path): - here = os.path.abspath(os.path.dirname(__file__)) - with codecs.open(os.path.join(here, rel_path), "r") as fp: - return fp.read() - - -def get_version(rel_path): - for line in read(rel_path).splitlines(): - if line.startswith("__version__"): - delim = '"' if '"' in line else "'" - return line.split(delim)[1] - return "0.0.0" - - -setup( - name="mmlspark", - version=get_version("mmlspark/__init__.py"), - description="Microsoft ML for Spark", - long_description="Microsoft ML for Apache Spark contains Microsoft's open source " - + "contributions to the Apache Spark ecosystem", - license="MIT", - packages=find_packages(), - url="https://github.com/Azure/mmlspark", - author="Microsoft", - author_email="mmlspark-support@microsoft.com", - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "Intended Audience :: Data Scientists", - "Topic :: Software Development :: Datascience Tools", - "License :: OSI Approved :: MIT License", - "Programming Language :: 
Python :: 2", - "Programming Language :: Python :: 3", - ], - zip_safe=True, - package_data={"mmlspark": ["../LICENSE.txt", "../README.txt"]}, -) diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala b/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala deleted file mode 100644 index 03785cbd8c9..00000000000 --- a/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.codegen - -import java.io.File - -import com.microsoft.ml.spark.build.BuildInfo - -object Config { - val DebugMode = sys.env.getOrElse("DEBUGMODE", "").trim.toLowerCase == "true" - - val TopDir = BuildInfo.baseDirectory - val Version = BuildInfo.version - val PackageName = BuildInfo.name - val TargetDir = new File(TopDir, s"target/scala-${BuildInfo.scalaVersion.slice(0,4)}") - val ScalaSrcDir = "src/main/scala" - - val GeneratedDir = new File(TargetDir, "generated") - val PackageDir = new File(GeneratedDir, "package") - val SrcDir = new File(GeneratedDir, "src") - val TestDir = new File(GeneratedDir, "test") - val DocDir = new File(GeneratedDir, "doc") - val TestDataDir = new File(GeneratedDir, "test-data") - - //Python Codegen Constant - val PySrcDir = new File(SrcDir, "python") - val PyPackageDir = new File(PackageDir, "python") - val PyTestDir = new File(TestDir, "python") - val PySrcOverrideDir = new File(TopDir, "src/main/python") - val PyTestOverrideDir = new File(TopDir, "src/test/python") - - //R Codegen Constants - val RSrcRoot = new File(SrcDir, "R") - val RSrcDir = new File(RSrcRoot, "mmlspark/R") - val RPackageDir = new File(PackageDir, "R") - val RTestDir = new File(RSrcRoot, "mmlspark/tests") - - val RTestOverrideDir = new File(TopDir, "src/test/R") - val RSrcOverrideDir = new File(TopDir, "src/main/R") - - //val rPackageFile = new 
File(rPackageDir, s"mmlspark-$mmlVer.zip") - - val InternalPrefix = "_" - val ScopeDepth = " " * 4 - - val CopyrightLines = - s"""|# Copyright (C) Microsoft Corporation. All rights reserved. - |# Licensed under the MIT License. See LICENSE in project root for information. - |""".stripMargin - - // The __init__.py file - def packageHelp(importString: String): String = { - s"""|$CopyrightLines - | - |"\"" - |MicrosoftML is a library of Python classes to interface with the - |Microsoft scala APIs to utilize Apache Spark to create distibuted - |machine learning models. - | - |MicrosoftML simplifies training and scoring classifiers and - |regressors, as well as facilitating the creation of models using the - |CNTK library, images, and text. - |"\"" - | - |__version__ = "${BuildInfo.pythonizedVersion}" - |__spark_package_version__ = "${BuildInfo.version}" - | - |$importString - |""".stripMargin - } -} diff --git a/src/test/python/mmlsparktest/cyber/utils/__init__.py b/src/test/python/mmlsparktest/cyber/utils/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/src/test/python/mmlsparktest/nn/__init__.py b/src/test/python/mmlsparktest/nn/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/src/test/python/mmlsparktest/recommendation/__init__.py b/src/test/python/mmlsparktest/recommendation/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/src/test/python/mmlsparktest/spark.py b/src/test/python/mmlsparktest/spark.py deleted file mode 100644 index 6100bdf6cd7..00000000000 --- a/src/test/python/mmlsparktest/spark.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE in project root for information. 
- -from pyspark.sql import SparkSession, SQLContext -import os -import mmlspark - -spark = SparkSession.builder \ - .master("local[*]") \ - .appName("PysparkTests") \ - .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.12:" + mmlspark.__spark_package_version__) \ - .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") \ - .config("spark.executor.heartbeatInterval", "60s") \ - .config("spark.sql.shuffle.partitions", 10) \ - .config("spark.sql.crossJoin.enabled", "true") \ - .getOrCreate() - -sc = SQLContext(spark.sparkContext) diff --git a/src/test/python/mmlsparktest/vw/__init__.py b/src/test/python/mmlsparktest/vw/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala b/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala deleted file mode 100644 index 4981013301c..00000000000 --- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. 
- -package com.microsoft.ml.spark.cntk - -import java.io.File - -import com.microsoft.ml.spark.build.BuildInfo -import com.microsoft.ml.spark.core.env.FileUtilities -import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.image.UnrollImage -import org.apache.spark.ml.linalg.DenseVector -import org.apache.spark.sql._ -import com.microsoft.ml.spark.io.IOImplicits._ - -trait CNTKTestUtils extends TestBase { - - val filesRoot = BuildInfo.datasetDir.toString - val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString - val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString - val inputCol = "cntk_images" - val outputCol = "out" - val labelCol = "labels" - - val featureVectorLength = 3 * 32 * 32 - lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString - - def testModelDF(spark: SparkSession): DataFrame = { - import spark.implicits._ - spark.sparkContext.parallelize(Seq( - Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720, - -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090), - Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990, - -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880), - Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967, - 3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830), - Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430, - -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510), - Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690, - 4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270), - Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470, - 0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF - } - - def testImages(spark: SparkSession): DataFrame = { - val images = spark.read.image.load(imagePath) - - val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol) - - 
unroll.transform(images).select(inputCol) - } - - def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = { - import spark.implicits._ - if (outputDouble) { - List - .fill(rows)(List.fill(size)(0.0).toArray) - .zip(List.fill(rows)(0.0)) - .toDF(inputCol, labelCol) - } else { - List - .fill(rows)(List.fill(size)(0.0.toFloat).toArray) - .zip(List.fill(rows)(0.0)) - .toDF(inputCol, labelCol) - } - } - - protected def compareToTestModel(result: DataFrame) = { - //TODO improve checks - assert(result.columns.toSet == Set(inputCol, outputCol)) - assert(result.count() == testModelDF(result.sparkSession).count()) - val max = result - .select(outputCol) - .collect() - .map(row => row.getAs[DenseVector](0).toArray.max) - .max - assert(max < 10 & max > -10) - } - -} diff --git a/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala b/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala deleted file mode 100644 index 67d667e339e..00000000000 --- a/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. 
- -package com.microsoft.ml.spark.codegen - -import java.io.File -import com.microsoft.ml.spark.build.BuildInfo -import com.microsoft.ml.spark.codegen.Config._ -import com.microsoft.ml.spark.core.env.FileUtilities._ -import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.core.test.fuzzing.PyTestFuzzing -import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices -import org.apache.commons.io.FileUtils -import org.apache.commons.io.FilenameUtils._ - -object CodeGenUtils { - def clean(dir: File): Unit = if (dir.exists()) FileUtils.forceDelete(dir) - - def toDir(f: File): File = new File(f, File.separator) -} - -object CodeGen { - - import CodeGenUtils._ - - def generatePythonClasses(): Unit = { - instantiateServices[PythonWrappable].foreach { w => - w.makePyFile() - } - } - - def generateRClasses(): Unit = { - instantiateServices[RWrappable].foreach { w => - w.makeRFile() - } - } - - private def makeInitFiles(packageFolder: String = ""): Unit = { - val dir = new File(new File(PySrcDir, "mmlspark"), packageFolder) - val packageString = if (packageFolder != "") packageFolder.replace("/", ".") else "" - val importStrings = - dir.listFiles.filter(_.isFile).sorted - .map(_.getName) - .filter(name => name.endsWith(".py") && !name.startsWith("_") && !name.startsWith("test")) - .map(name => s"from mmlspark$packageString.${getBaseName(name)} import *\n").mkString("") - writeFile(new File(dir, "__init__.py"), packageHelp(importStrings)) - dir.listFiles().filter(_.isDirectory).foreach(f => - makeInitFiles(packageFolder + "/" + f.getName) - ) - } - - //noinspection ScalaStyle - def generateRPackageData(): Unit = { - // description file; need to encode version as decimal - val today = new java.text.SimpleDateFormat("yyyy-MM-dd") - .format(new java.util.Date()) - - RSrcDir.mkdirs() - writeFile(new File(RSrcDir.getParentFile, "DESCRIPTION"), - s"""|Package: mmlspark - |Title: Access to MMLSpark via R - |Description: Provides an 
interface to MMLSpark. - |Version: ${BuildInfo.rVersion} - |Date: $today - |Author: Microsoft Corporation - |Maintainer: MMLSpark Team - |URL: https://github.com/Azure/mmlspark - |BugReports: https://github.com/Azure/mmlspark/issues - |Depends: - | R (>= 2.12.0) - |Imports: - | sparklyr - |License: MIT - |Suggests: - | testthat (>= 3.0.0) - |Config/testthat/edition: 3 - |""".stripMargin) - - writeFile(new File(RSrcDir, "package_register.R"), - s"""|#' @import sparklyr - |spark_dependencies <- function(spark_version, scala_version, ...) { - | spark_dependency( - | jars = c(), - | packages = c( - | sprintf("com.microsoft.ml.spark:mmlspark_%s:${BuildInfo.version}", scala_version) - | ), - | repositories = c("https://mmlspark.azureedge.net/maven") - | ) - |} - | - |#' @import sparklyr - |.onLoad <- function(libname, pkgname) { - | sparklyr::register_extension(pkgname) - |} - |""".stripMargin) - - writeFile(new File(RSrcDir.getParentFile, "mmlspark.Rproj"), - """ - |Version: 1.0 - | - |RestoreWorkspace: Default - |SaveWorkspace: Default - |AlwaysSaveHistory: Default - | - |EnableCodeIndexing: Yes - |UseSpacesForTab: Yes - |NumSpacesForTab: 4 - |Encoding: UTF-8 - | - |RnwWeave: Sweave - |LaTeX: pdfLaTeX - | - |BuildType: Package - |PackageUseDevtools: Yes - |PackageInstallArgs: --no-multiarch --with-keep.source - | - |""".stripMargin) - - } - - def rGen(): Unit = { - clean(RSrcRoot) - generateRPackageData() - generateRClasses() - FileUtils.copyDirectoryToDirectory(toDir(RSrcOverrideDir), toDir(RSrcDir)) - FileUtils.copyDirectoryToDirectory(toDir(RTestOverrideDir), toDir(RTestDir)) - } - - def pyGen(): Unit = { - clean(PySrcDir) - generatePythonClasses() - TestBase.stopSparkSession() - FileUtils.copyDirectoryToDirectory(toDir(PySrcOverrideDir), toDir(PySrcDir)) - makeInitFiles() - } - - def main(args: Array[String]): Unit = { - clean(PackageDir) - rGen() - pyGen() - } - -} - -object TestGen { - - import CodeGenUtils._ - - def generatePythonTests(): Unit = { - 
instantiateServices[PyTestFuzzing[_]].foreach { ltc => - try { - ltc.makePyTestFile() - } catch { - case _: NotImplementedError => - println(s"ERROR: Could not generate test for ${ltc.testClassName} because of Complex Parameters") - } - } - } - - private def makeInitFiles(packageFolder: String = ""): Unit = { - val dir = new File(new File(PyTestDir, "mmlsparktest"), packageFolder) - writeFile(new File(dir, "__init__.py"), "") - dir.listFiles().filter(_.isDirectory).foreach(f => - makeInitFiles(packageFolder + "/" + f.getName) - ) - } - - def main(args: Array[String]): Unit = { - clean(TestDataDir) - clean(PyTestDir) - generatePythonTests() - TestBase.stopSparkSession() - FileUtils.copyDirectoryToDirectory(toDir(PyTestOverrideDir), toDir(PyTestDir)) - makeInitFiles() - } -} diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala b/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala index ebc1f584a0c..590ce904417 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala +++ b/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala @@ -3,6 +3,8 @@ package com.microsoft.ml.spark.core.test.fuzzing +import java.lang.reflect.ParameterizedType + import com.microsoft.ml.spark.core.contracts.{HasFeaturesCol, HasInputCol, HasLabelCol, HasOutputCol} import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.core.utils.JarLoadingUtils @@ -10,7 +12,6 @@ import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.ml.util.{MLReadable, MLWritable} -import java.lang.reflect.ParameterizedType import scala.language.existentials /** Tests to validate fuzzing of modules. 
*/ @@ -261,17 +262,17 @@ class FuzzingTest extends TestBase { // set the context loader to pick up on the jars //Thread.currentThread().setContextClassLoader(JarLoadingUtils.classLoader) - private lazy val readers: List[MLReadable[_]] = JarLoadingUtils.instantiateObjects[MLReadable[_]] + private lazy val readers: List[MLReadable[_]] = JarLoadingUtils.instantiateObjects[MLReadable[_]]() - private lazy val pipelineStages: List[PipelineStage] = JarLoadingUtils.instantiateServices[PipelineStage] + private lazy val pipelineStages: List[PipelineStage] = JarLoadingUtils.instantiateServices[PipelineStage]() private lazy val experimentFuzzers: List[ExperimentFuzzing[_ <: PipelineStage]] = - JarLoadingUtils.instantiateServices[ExperimentFuzzing[_ <: PipelineStage]] + JarLoadingUtils.instantiateServices[ExperimentFuzzing[_ <: PipelineStage]]() private lazy val serializationFuzzers: List[SerializationFuzzing[_ <: PipelineStage with MLWritable]] = - JarLoadingUtils.instantiateServices[SerializationFuzzing[_ <: PipelineStage with MLWritable]] + JarLoadingUtils.instantiateServices[SerializationFuzzing[_ <: PipelineStage with MLWritable]]() private lazy val pytestFuzzers: List[PyTestFuzzing[_ <: PipelineStage]] = - JarLoadingUtils.instantiateServices[PyTestFuzzing[_ <: PipelineStage]] + JarLoadingUtils.instantiateServices[PyTestFuzzing[_ <: PipelineStage]]() } diff --git a/templates/ivy_cache_2.yml b/templates/ivy_cache_2.yml deleted file mode 100644 index 3da8290990c..00000000000 --- a/templates/ivy_cache_2.yml +++ /dev/null @@ -1,9 +0,0 @@ -steps: - - task: Cache@2 - inputs: - key: 'ivy3 | "$(Agent.OS)" | **/build.sbt' - restoreKeys: | - ivy2 | "$(Agent.OS)" - ivy2 - path: $(Pipeline.Workspace)/../../.ivy2/cache - displayName: Cache local ivy repo \ No newline at end of file diff --git a/tools/docker/demo/Dockerfile b/tools/docker/demo/Dockerfile index e35714aded3..44eb29299b0 100644 --- a/tools/docker/demo/Dockerfile +++ b/tools/docker/demo/Dockerfile @@ -30,7 +30,7 @@ RUN 
conda install jupyter \ && conda clean --all --yes COPY tools/docker/demo/init_notebook.py /root/.ipython/profile_default/startup -COPY notebooks/samples notebooks +COPY notebooks notebooks WORKDIR notebooks ARG MMLSPARK_VERSION diff --git a/tools/tests/run_r_tests.R b/tools/tests/run_r_tests.R index 99daad5ba4b..847ea3ea46b 100644 --- a/tools/tests/run_r_tests.R +++ b/tools/tests/run_r_tests.R @@ -3,7 +3,7 @@ tryCatch({ spark_install_find(version = "3.1.2") }, error=function(err) { - spark_install_tar("../../../../../../spark-3.1.2-bin-hadoop3.2.tgz") + spark_install_tar("../../../../../../../../spark-3.1.2-bin-hadoop3.2.tgz") } ) diff --git a/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py similarity index 97% rename from src/main/python/mmlspark/vw/VowpalWabbitClassifier.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py index ba9d72dc1ee..ac33082148c 100644 --- a/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py +++ b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py @@ -1,14 +1,14 @@ -# Copyright (C) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE in project root for information. - -from mmlspark.vw._VowpalWabbitClassifier import _VowpalWabbitClassifier -from pyspark.ml.common import inherit_doc - -@inherit_doc -class VowpalWabbitClassifier(_VowpalWabbitClassifier): - - def setInitialModel(self, model): - """ - Initialize the estimator with a previously trained model. - """ - self._java_obj.setInitialModel(model._java_obj.getModel()) +# Copyright (C) Microsoft Corporation. All rights reserved. 
+# Licensed under the MIT License. See LICENSE in project root for information. + +from mmlspark.vw._VowpalWabbitClassifier import _VowpalWabbitClassifier +from pyspark.ml.common import inherit_doc + +@inherit_doc +class VowpalWabbitClassifier(_VowpalWabbitClassifier): + + def setInitialModel(self, model): + """ + Initialize the estimator with a previously trained model. + """ + self._java_obj.setInitialModel(model._java_obj.getModel()) diff --git a/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitRegressor.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py diff --git a/src/test/python/mmlsparktest/cyber/anamoly/__init__.py b/vw/src/main/python/mmlspark/vw/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/anamoly/__init__.py rename to vw/src/main/python/mmlspark/vw/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala 
b/vw/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala index 401daeadd24..59c983aac1b 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala @@ -9,7 +9,7 @@ import com.microsoft.ml.spark.codegen.Wrappable import com.microsoft.ml.spark.core.contracts.HasWeightCol import com.microsoft.ml.spark.core.env.StreamUtilities import com.microsoft.ml.spark.core.utils.{ClusterUtil, StopWatch} -import com.microsoft.ml.spark.downloader.FaultToleranceUtils +import 
com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.spark.TaskContext import org.apache.spark.internal._ import org.apache.spark.ml.param._ diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala similarity index 94% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala index 46b85505e73..d0208829915 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala @@ -4,15 +4,13 @@ package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.env.StreamUtilities -import com.microsoft.ml.spark.downloader.FaultToleranceUtils +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.spark.binary.BinaryFileFormat -import org.apache.spark.ml.ComplexParamsWritable -import org.apache.spark.ml.linalg.{DenseVector, SparseVector} import org.apache.spark.ml.param.{ByteArrayParam, DataFrameParam, Param} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.functions.{col, struct, udf} import org.apache.spark.sql.types.StructType -import org.vowpalwabbit.spark.{VowpalWabbitArguments, VowpalWabbitExample, VowpalWabbitMurmur, VowpalWabbitNative} +import org.vowpalwabbit.spark.{VowpalWabbitArguments, VowpalWabbitExample, VowpalWabbitNative} import org.vowpalwabbit.spark.prediction.ScalarPrediction import scala.io.Source diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala index 75dd1d651ae..7ae43e536d0 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala @@ -1,52 +1,52 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import org.apache.spark.sql.Row -import org.vowpalwabbit.spark.VowpalWabbitMurmur - -import scala.collection.mutable - -/** - * Featurize boolean value into native VW structure. (True = hash(feature name):1, False ignored). - * @param fieldIdx input field index. - * @param columnName used as feature name. - * @param namespaceHash pre-hashed namespace. - * @param mask bit mask applied to final hash. - */ -private[ml] class BooleanFeaturizer(override val fieldIdx: Int, - override val columnName: String, - namespaceHash: Int, mask: Int) - extends Featurizer(fieldIdx) with ElementFeaturizer[Boolean] { - - /** - * Pre-hashed feature index. - */ - val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) - - /** - * Featurize a single row. - * @param row input row. - * @param indices output indices. - * @param values output values. - * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. - * Also due to SparseVector limitations we don't support 64bit indices (e.g. 
indices are signed 32bit ints) - */ - override def featurize(row: Row, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - - featurize(0, row.getBoolean(fieldIdx), indices, values) - } - - def featurize(idx: Int, - value: Boolean, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - if (value) { - indices += featureIdx + idx - values += 1.0 - } - } -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import org.apache.spark.sql.Row +import org.vowpalwabbit.spark.VowpalWabbitMurmur + +import scala.collection.mutable + +/** + * Featurize boolean value into native VW structure. (True = hash(feature name):1, False ignored). + * @param fieldIdx input field index. + * @param columnName used as feature name. + * @param namespaceHash pre-hashed namespace. + * @param mask bit mask applied to final hash. + */ +private[ml] class BooleanFeaturizer(override val fieldIdx: Int, + override val columnName: String, + namespaceHash: Int, mask: Int) + extends Featurizer(fieldIdx) with ElementFeaturizer[Boolean] { + + /** + * Pre-hashed feature index. + */ + val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) + + /** + * Featurize a single row. + * @param row input row. + * @param indices output indices. + * @param values output values. + * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. + * Also due to SparseVector limitations we don't support 64bit indices (e.g. 
indices are signed 32bit ints) + */ + override def featurize(row: Row, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + + featurize(0, row.getBoolean(fieldIdx), indices, values) + } + + def featurize(idx: Int, + value: Boolean, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + if (value) { + indices += featureIdx + idx + values += 1.0 + } + } +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala index a8d6bf1353e..deceb8ddd7a 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala @@ -1,29 +1,29 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import com.microsoft.ml.spark.vw.VowpalWabbitMurmurWithPrefix -import org.apache.spark.sql.Row - -import scala.collection.mutable - -private[ml] abstract class Featurizer(val fieldIdx: Int) extends Serializable { - - val columnName: String - - /** - * Initialize hasher that already pre-hashes the column prefix. - */ - protected lazy val hasher: VowpalWabbitMurmurWithPrefix = new VowpalWabbitMurmurWithPrefix(columnName) - - /** - * Featurize a single row. 
- * @param row input row. - * @param indices output indices. - * @param values output values. - * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. - * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) - */ - def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import com.microsoft.ml.spark.vw.VowpalWabbitMurmurWithPrefix +import org.apache.spark.sql.Row + +import scala.collection.mutable + +private[ml] abstract class Featurizer(val fieldIdx: Int) extends Serializable { + + val columnName: String + + /** + * Initialize hasher that already pre-hashes the column prefix. + */ + protected lazy val hasher: VowpalWabbitMurmurWithPrefix = new VowpalWabbitMurmurWithPrefix(columnName) + + /** + * Featurize a single row. + * @param row input row. + * @param indices output indices. + * @param values output values. + * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. + * Also due to SparseVector limitations we don't support 64bit indices (e.g. 
indices are signed 32bit ints) + */ + def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala index c7ade02c07c..cc56a1081b3 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala @@ -1,61 +1,61 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import org.apache.spark.sql.Row -import org.vowpalwabbit.spark.VowpalWabbitMurmur - -import scala.collection.mutable - -/** - * Featurize numeric values into native VW structure. ((hash(column name):value) - * @param fieldIdx input field index. - * @param columnName used as feature name prefix. 
- * @param namespaceHash pre-hashed namespace. - * @param mask bit mask applied to final hash. - */ -private[ml] class NumericFeaturizer[T: Numeric](override val fieldIdx: Int, - override val columnName: String, - val namespaceHash: Int, - val mask: Int, - val zero: Numeric[T]) - extends Featurizer(fieldIdx) with ElementFeaturizer[T] { - - /** - * Pre-hashed feature index. - */ - val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) - - override def featurize(row: Row, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - featurize(0, row.getAs[T](fieldIdx), indices, values) - } - - def featurize(idx: Int, - value: T, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - // Note: 0 valued features are always filtered. - if (value != zero.zero) { - indices += featureIdx + idx - // This is weird but zero is a numeric typeclass that is used to convert the generic T to a double. - values += zero.toDouble(value) - } - () - } -} - -class NullableNumericFeaturizer[T: Numeric](override val fieldIdx: Int, - override val columnName: String, - override val namespaceHash: Int, - override val mask: Int, - override val zero: Numeric[T]) - extends NumericFeaturizer[T](fieldIdx, columnName, namespaceHash, mask, zero) { - override def featurize(row: Row, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = - if (!row.isNullAt(fieldIdx)) - super.featurize(row, indices, values) -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import org.apache.spark.sql.Row +import org.vowpalwabbit.spark.VowpalWabbitMurmur + +import scala.collection.mutable + +/** + * Featurize numeric values into native VW structure. ((hash(column name):value) + * @param fieldIdx input field index. 
+ * @param columnName used as feature name prefix. + * @param namespaceHash pre-hashed namespace. + * @param mask bit mask applied to final hash. + */ +private[ml] class NumericFeaturizer[T: Numeric](override val fieldIdx: Int, + override val columnName: String, + val namespaceHash: Int, + val mask: Int, + val zero: Numeric[T]) + extends Featurizer(fieldIdx) with ElementFeaturizer[T] { + + /** + * Pre-hashed feature index. + */ + val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) + + override def featurize(row: Row, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + featurize(0, row.getAs[T](fieldIdx), indices, values) + } + + def featurize(idx: Int, + value: T, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + // Note: 0 valued features are always filtered. + if (value != zero.zero) { + indices += featureIdx + idx + // This is weird but zero is a numeric typeclass that is used to convert the generic T to a double. 
+ values += zero.toDouble(value) + } + () + } +} + +class NullableNumericFeaturizer[T: Numeric](override val fieldIdx: Int, + override val columnName: String, + override val namespaceHash: Int, + override val mask: Int, + override val zero: Numeric[T]) + extends NumericFeaturizer[T](fieldIdx, columnName, namespaceHash, mask, zero) { + override def featurize(row: Row, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = + if (!row.isNullAt(fieldIdx)) + super.featurize(row, indices, values) +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala index 804f6b482f2..d5821415228 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala @@ -1,47 +1,47 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import org.apache.spark.sql.Row - -import scala.collection.mutable - -/** - * Featurize string into native VW structure. (hash(column name + value):1) - * @param fieldIdx input field index. - * @param columnName used as feature name prefix. - * @param namespaceHash pre-hashed namespace. - * @param mask bit mask applied to final hash. 
- */ -private[ml] class StringFeaturizer(override val fieldIdx: Int, - override val columnName: String, - val namespaceHash: Int, - val mask: Int) - extends Featurizer(fieldIdx) with ElementFeaturizer[String] { - - /** - * Featurize a single row. - * @param row input row. - * @param indices output indices. - * @param values output values. - * @note this interface isn't very Scala-esce, but it avoids lots of allocation. - * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) - */ - override def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit = { - featurize(0, row.getString(fieldIdx), indices, values) - - () - } - - def featurize(idx: Int, - value: String, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - - if (value != null && !value.isEmpty) { - indices += mask & hasher.hash(value, namespaceHash) - values += 1.0 - } - } -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import org.apache.spark.sql.Row + +import scala.collection.mutable + +/** + * Featurize string into native VW structure. (hash(column name + value):1) + * @param fieldIdx input field index. + * @param columnName used as feature name prefix. + * @param namespaceHash pre-hashed namespace. + * @param mask bit mask applied to final hash. + */ +private[ml] class StringFeaturizer(override val fieldIdx: Int, + override val columnName: String, + val namespaceHash: Int, + val mask: Int) + extends Featurizer(fieldIdx) with ElementFeaturizer[String] { + + /** + * Featurize a single row. + * @param row input row. + * @param indices output indices. + * @param values output values. + * @note this interface isn't very Scala-esce, but it avoids lots of allocation. 
+ * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) + */ + override def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit = { + featurize(0, row.getString(fieldIdx), indices, values) + + () + } + + def featurize(idx: Int, + value: String, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + + if (value != null && !value.isEmpty) { + indices += mask & hasher.hash(value, namespaceHash) + values += 1.0 + } + } +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala diff --git a/src/test/python/mmlsparktest/cyber/feature/__init__.py b/vw/src/test/python/mmlsparktest/vw/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/feature/__init__.py rename to vw/src/test/python/mmlsparktest/vw/__init__.py diff --git a/src/test/python/mmlsparktest/vw/test_vw.py 
b/vw/src/test/python/mmlsparktest/vw/test_vw.py similarity index 100% rename from src/test/python/mmlsparktest/vw/test_vw.py rename to vw/src/test/python/mmlsparktest/vw/test_vw.py diff --git a/src/test/python/mmlsparktest/vw/test_vw_cb.py b/vw/src/test/python/mmlsparktest/vw/test_vw_cb.py similarity index 100% rename from src/test/python/mmlsparktest/vw/test_vw_cb.py rename to vw/src/test/python/mmlsparktest/vw/test_vw_cb.py diff --git a/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv b/vw/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv rename to vw/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala 
similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala