diff --git a/.gitignore b/.gitignore index 8fd3247cec1..ec1a9ac15e9 100644 --- a/.gitignore +++ b/.gitignore @@ -47,4 +47,7 @@ node_modules/ .Rproj.user # R output -*.Rout \ No newline at end of file +*.Rout + +# Misc +.bsp diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 46b481c7130..739b6065c41 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -55,7 +55,7 @@ this process: #### Implement documentation -- Add a [sample Jupyter notebook](notebooks/samples) that shows the intended use +- Add a [sample Jupyter notebook](notebooks/) that shows the intended use case of your algorithm, with instructions in step-by-step manner. (The same notebook could be used for testing the code.) - Add in-line ScalaDoc comments to your source code, to generate the [API diff --git a/README.md b/README.md index 58c5cdcec6f..f7618c97eac 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ PySpark](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/index.htm | | | | |:--:|:--:|:--:| -| **Isolation Forest on Spark** | [**CyberML**](https://github.com/Azure/mmlspark/blob/master/notebooks/samples/CyberML%20-%20Anomalous%20Access%20Detection.ipynb) | **Conditional KNN** | +| **Isolation Forest on Spark** | [**CyberML**](https://github.com/Azure/mmlspark/blob/master/notebooks/CyberML%20-%20Anomalous%20Access%20Detection.ipynb) | **Conditional KNN** | | Distributed Nonlinear Outlier Detection | Machine Learning Tools for Cyber Security | Scalable KNN Models with Conditional Queries | @@ -86,29 +86,29 @@ PySpark](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/index.htm - Train and evaluate a flight delay prediction system ([example 2]) - Finding anomalous data access patterns using the Access Anomalies package of CyberML ([example 11]) -See our [notebooks](notebooks/samples/) for all examples. +See our [notebooks](notebooks/) for all examples. 
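The CONTRIBUTING.md hunk above asks contributors to add in-line ScalaDoc comments, from which the API documentation is generated. As a companion, here is a minimal sketch of that comment style; the class and its parameter are hypothetical and not part of this change:

```scala
/** Counts whitespace-separated tokens in a string.
  *
  * Hypothetical example, shown only to illustrate the in-line ScalaDoc
  * style that the generated API documentation is built from.
  *
  * @param separator the regular expression used to split the input
  */
class TokenCounter(separator: String = "\\s+") {
  /** Returns the number of non-empty tokens in `text`. */
  def count(text: String): Int = text.split(separator).count(_.nonEmpty)
}
```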
-[example 1]: notebooks/samples/Classification%20-%20Adult%20Census.ipynb "Adult Census Income Training" +[example 1]: notebooks/Classification%20-%20Adult%20Census.ipynb "Adult Census Income Training" -[example 2]: notebooks/samples/Regression%20-%20Flight%20Delays.ipynb "Regression Example with Flight Delay Dataset" +[example 2]: notebooks/Regression%20-%20Flight%20Delays.ipynb "Regression Example with Flight Delay Dataset" -[example 3]: notebooks/samples/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb "Quantile Regression with LightGBM" +[example 3]: notebooks/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb "Quantile Regression with LightGBM" -[example 4]: notebooks/samples/TextAnalytics%20-%20Amazon%20Book%20Reviews.ipynb "Amazon Book Reviews - TextFeaturizer" +[example 4]: notebooks/TextAnalytics%20-%20Amazon%20Book%20Reviews.ipynb "Amazon Book Reviews - TextFeaturizer" -[example 5]: notebooks/samples/HyperParameterTuning%20-%20Fighting%20Breast%20Cancer.ipynb "Hyperparameter Tuning with MMLSpark" +[example 5]: notebooks/HyperParameterTuning%20-%20Fighting%20Breast%20Cancer.ipynb "Hyperparameter Tuning with MMLSpark" -[example 6]: notebooks/samples/DeepLearning%20-%20CIFAR10%20Convolutional%20Network.ipynb "CIFAR10 CNTK CNN Evaluation" +[example 6]: notebooks/DeepLearning%20-%20CIFAR10%20Convolutional%20Network.ipynb "CIFAR10 CNTK CNN Evaluation" -[example 7]: notebooks/samples/OpenCV%20-%20Pipeline%20Image%20Transformations.ipynb "Pipeline Image Transformations" +[example 7]: notebooks/OpenCV%20-%20Pipeline%20Image%20Transformations.ipynb "Pipeline Image Transformations" -[example 8]: notebooks/samples/DeepLearning%20-%20BiLSTM%20Medical%20Entity%20Extraction.ipynb "Medical Entity Extraction" +[example 8]: notebooks/DeepLearning%20-%20BiLSTM%20Medical%20Entity%20Extraction.ipynb "Medical Entity Extraction" -[example 9]: notebooks/samples/DeepLearning%20-%20Flower%20Image%20Classification.ipynb "Deep Flower Classification" +[example 9]: notebooks/DeepLearning%20-%20Flower%20Image%20Classification.ipynb "Deep Flower Classification" [example 10]: notebooks/gpu/DeepLearning%20-%20Distributed%20CNTK%20training.ipynb "CIFAR10 CNTK CNN Training" -[example 11]: notebooks/samples/CyberML%20-%20Anomalous%20Access%20Detection.ipynb "Access Anomalies documenation, training and evaluation example" +[example 11]: notebooks/CyberML%20-%20Anomalous%20Access%20Detection.ipynb "Access Anomalies documentation, training and evaluation example" ## A short example @@ -127,7 +127,7 @@ scoredImages = cntkModel.transform(imagesWithLabels) ... ``` -See [other sample notebooks](notebooks/samples/) as well as the MMLSpark +See [other sample notebooks](notebooks/) as well as the MMLSpark documentation for [Scala](http://mmlspark.azureedge.net/docs/scala/) and [PySpark](http://mmlspark.azureedge.net/docs/pyspark/).
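The short example referenced above is PySpark; a roughly equivalent Scala sketch follows. It assumes the `CNTKModel` transformer from `com.microsoft.ml.spark.cntk` with `setModelLocation`, `setInputCol`, and `setOutputCol` parameters, plus an `imagesWithLabels` DataFrame prepared elsewhere — none of these are defined by this diff:

```scala
import com.microsoft.ml.spark.cntk.CNTKModel
import org.apache.spark.sql.DataFrame

// Sketch only: mirrors the PySpark line
// `scoredImages = cntkModel.transform(imagesWithLabels)`.
def scoreImages(imagesWithLabels: DataFrame, modelPath: String): DataFrame = {
  val cntkModel = new CNTKModel()
    .setModelLocation(modelPath)   // location of a serialized CNTK model
    .setInputCol("images")         // column holding the image features
    .setOutputCol("scored_images") // column to receive the model outputs
  cntkModel.transform(imagesWithLabels)
}
```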
diff --git a/build.sbt b/build.sbt index 71471d683ee..fd8fad623ad 100644 --- a/build.sbt +++ b/build.sbt @@ -1,22 +1,20 @@ import java.io.{File, PrintWriter} import java.net.URL + import org.apache.commons.io.FileUtils import sbt.ExclusionRule -import sbt.internal.util.ManagedLogger import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} import scala.xml.transform.{RewriteRule, RuleTransformer} -import scala.sys.process.Process import BuildUtils._ +import xerial.sbt.Sonatype._ val condaEnvName = "mmlspark" -name := "mmlspark" -organization := "com.microsoft.ml.spark" -scalaVersion := "2.12.10" val sparkVersion = "3.1.2" +name := "mmlspark" +ThisBuild / organization := "com.microsoft.ml.spark" +ThisBuild / scalaVersion := "2.12.10" -//val scalaMajorVersion = settingKey[String]("scalaMajorVersion") -//scalaMajorVersion := {scalaVersion.value.split(".".toCharArray).dropRight(0).mkString(".")} val scalaMajorVersion = 2.12 val excludes = Seq( @@ -24,42 +22,28 @@ val excludes = Seq( ExclusionRule("org.scalatest") ) -libraryDependencies ++= Seq( +val coreDependencies = Seq( "org.apache.spark" %% "spark-core" % sparkVersion % "compile", "org.apache.spark" %% "spark-mllib" % sparkVersion % "compile", "org.apache.spark" %% "spark-avro" % sparkVersion % "provided", "org.apache.spark" %% "spark-tags" % sparkVersion % "test", "org.scalatest" %% "scalatest" % "3.0.5" % "test") - -libraryDependencies ++= Seq( +val extraDependencies = Seq( "org.scalactic" %% "scalactic" % "3.0.5", "io.spray" %% "spray-json" % "1.3.2", - "com.microsoft.cntk" % "cntk" % "2.4", - "org.openpnp" % "opencv" % "3.2.0-1", "com.jcraft" % "jsch" % "0.1.54", - "com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.14.0", "org.apache.httpcomponents" % "httpclient" % "4.5.6", "org.apache.httpcomponents" % "httpmime" % "4.5.6", - "com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.2.110", - "com.github.vowpalwabbit" % "vw-jni" % "8.9.1", "com.linkedin.isolation-forest" %% "isolation-forest_3.0.0" % "1.0.1", ).map(d => d excludeAll (excludes: _*)) +val dependencies = coreDependencies ++ extraDependencies def txt(e: Elem, label: String): String = "\"" + e.child.filter(_.label == label).flatMap(_.text).mkString + "\"" -def activateCondaEnv: Seq[String] = { - if (sys.props("os.name").toLowerCase.contains("windows")) { - osPrefix ++ Seq("activate", condaEnvName, "&&") - } else { - Seq() - //TODO figure out why this doesent work - //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&") - } -} - val omittedDeps = Set(s"spark-core_${scalaMajorVersion}", s"spark-mllib_${scalaMajorVersion}", "org.scala-lang") // skip dependency elements with a scope -pomPostProcess := { (node: XmlNode) => + +def pomPostFunc(node: XmlNode): scala.xml.Node = { new RuleTransformer(new RewriteRule { override def transform(node: XmlNode): XmlNodeSeq = node match { case e: Elem if e.label == "dependency" @@ -77,191 +61,17 @@ pomPostProcess := { (node: XmlNode) => }).transform(node).head } -resolvers += "Speech" at "https://mmlspark.blob.core.windows.net/maven/" - -val createCondaEnvTask = TaskKey[Unit]("createCondaEnv", "create conda env") -createCondaEnvTask := { - val s = streams.value - val hasEnv = Process("conda env list").lineStream.toList - .map(_.split("\\s+").head).contains(condaEnvName) - if (!hasEnv) { - runCmd(Seq("conda", "env", "create", "-f", "environment.yaml")) - } else { - println("Found conda env " + condaEnvName) - } -} - -val condaEnvLocation = TaskKey[String]("condaEnvLocation", "get install location of conda env") 
-condaEnvLocation := { - val s = streams.value - createCondaEnvTask.value - Process("conda env list").lineStream.toList - .map(_.split("\\s+")) - .map(l => (l.head, l.reverse.head)) - .filter(p => p._1 == condaEnvName) - .head._2 -} - +pomPostProcess := pomPostFunc -val cleanCondaEnvTask = TaskKey[Unit]("cleanCondaEnv", "create conda env") -cleanCondaEnvTask := { - runCmd(Seq("conda", "env", "remove", "--name", condaEnvName, "-y")) -} - -val codegenTask = TaskKey[Unit]("codegen", "Generate Code") -codegenTask := { - (runMain in Test).toTask(" com.microsoft.ml.spark.codegen.CodeGen").value -} - -val testgenTask = TaskKey[Unit]("testgen", "Generate Tests") -testgenTask := { - (runMain in Test).toTask(" com.microsoft.ml.spark.codegen.TestGen").value -} - -val genDir = join("target", s"scala-${scalaMajorVersion}", "generated") -val unidocDir = join("target", s"scala-${scalaMajorVersion}", "unidoc") -val pythonSrcDir = join(genDir.toString, "src", "python") -val unifiedDocDir = join(genDir.toString, "doc") -val pythonDocDir = join(unifiedDocDir.toString, "pyspark") -val pythonPackageDir = join(genDir.toString, "package", "python") -val pythonTestDir = join(genDir.toString, "test", "python") -val rSrcDir = join(genDir.toString, "src", "R", "mmlspark") -val rPackageDir = join(genDir.toString, "package", "R") - -val pythonizedVersion = settingKey[String]("Pythonized version") -pythonizedVersion := { - if (version.value.contains("-")) { - version.value.split("-".head).head + ".dev1" - } else { - version.value - } -} - -val rVersion = settingKey[String]("R version") -rVersion := { - if (version.value.contains("-")) { - version.value.split("-".head).head - } else { - version.value - } -} - -def rCmd(cmd: Seq[String], wd: File, libPath: String): Unit = { - runCmd(activateCondaEnv ++ cmd, wd, Map("R_LIBS" -> libPath, "R_USER_LIBS" -> libPath)) -} - -val packageR = TaskKey[Unit]("packageR", "Generate roxygen docs and zip R package") -packageR := { - createCondaEnvTask.value - codegenTask.value - val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString - rCmd(Seq("R", "-q", "-e", "roxygen2::roxygenise()"), rSrcDir, libPath) - rPackageDir.mkdirs() - zipFolder(rSrcDir, new File(rPackageDir, s"mmlspark-${version.value}.zip")) -} - -val testR = TaskKey[Unit]("testR", "Run testthat on R tests") -testR := { - packageR.value - publishLocal.value - val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString - rCmd(Seq("R", "CMD", "INSTALL", "--no-multiarch", "--with-keep.source", "mmlspark"), rSrcDir.getParentFile, libPath) - val testRunner = join("tools", "tests", "run_r_tests.R").getAbsolutePath - rCmd(Seq("Rscript", testRunner), rSrcDir, libPath) -} - -val publishR = TaskKey[Unit]("publishR", "publish R package to blob") -publishR := { - codegenTask.value - packageR.value - val rPackage = rPackageDir.listFiles().head - singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr") -} - -val packagePythonTask = TaskKey[Unit]("packagePython", "Package python sdk") -packagePythonTask := { - codegenTask.value - createCondaEnvTask.value - val destPyDir = join("target", s"scala-${scalaMajorVersion}", "classes", "mmlspark") - if (destPyDir.exists()) FileUtils.forceDelete(destPyDir) - FileUtils.copyDirectory(join(pythonSrcDir.getAbsolutePath, "mmlspark"), destPyDir) - runCmd( - activateCondaEnv ++ - Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", s"${pythonPackageDir.absolutePath}"), - pythonSrcDir) -} - -val installPipPackageTask = TaskKey[Unit]("installPipPackage", 
"install python sdk") -installPipPackageTask := { - packagePythonTask.value - publishLocal.value - runCmd( - activateCondaEnv ++ Seq("pip", "install", "-I", - s"mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl"), - pythonPackageDir) -} - -val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python") -generatePythonDoc := { - installPipPackageTask.value - runCmd(activateCondaEnv ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), - join(pythonSrcDir.toString, "mmlspark")) - runCmd(activateCondaEnv ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), - join(pythonSrcDir.toString, "mmlspark")) -} - -val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python") -publishDocs := { - generatePythonDoc.value - (Compile / unidoc).value - val html = - """ - |
- |<a href="pyspark/index.html">pyspark/</a> - |<a href="scala/index.html">scala/</a> - | - """.stripMargin - val scalaDir = join(unifiedDocDir.toString, "scala") - if (scalaDir.exists()) FileUtils.forceDelete(scalaDir) - FileUtils.copyDirectory(unidocDir, scalaDir) - FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8") - uploadToBlob(unifiedDocDir.toString, version.value, "docs") -} - -val publishPython = TaskKey[Unit]("publishPython", "publish python wheel") -publishPython := { - publishLocal.value - packagePythonTask.value - singleUploadToBlob( - join(pythonPackageDir.toString, s"mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl").toString, - version.value + s"/mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl", - "pip") -} - -val testPythonTask = TaskKey[Unit]("testPython", "test python sdk") - -testPythonTask := { - installPipPackageTask.value - testgenTask.value - runCmd( - activateCondaEnv ++ Seq("python", - "-m", - "pytest", - "--cov=mmlspark", - "--junitxml=../../../../python-test-results.xml", - "--cov-report=xml", - "mmlsparktest" - ), - new File(s"target/scala-${scalaMajorVersion}/generated/test/python/") - ) -} +val speechResolver = "Speech" at "https://mmlspark.blob.core.windows.net/maven/" val getDatasetsTask = TaskKey[Unit]("getDatasets", "download datasets used for testing") val datasetName = "datasets-2020-08-27.tgz" val datasetUrl = new URL(s"https://mmlspark.blob.core.windows.net/installers/$datasetName") val datasetDir = settingKey[File]("The directory that holds the dataset") -datasetDir := { - join(target.value.toString, s"scala-${scalaMajorVersion}", "datasets", datasetName.split(".".toCharArray.head).head) +ThisBuild / datasetDir := { + join(artifactPath.in(packageBin).in(Compile).value.getParentFile, + "datasets", datasetName.split(".".toCharArray.head).head) } getDatasetsTask := { @@ -276,48 +86,61 @@ getDatasetsTask := { val genBuildInfo = TaskKey[Unit]("genBuildInfo", "generate a build info file") genBuildInfo := { - val buildInfo = + val docInfo = s""" - |MMLSpark Build and Release Information - |--------------- - | - |### Maven Coordinates - | `${organization.value}:${name.value}_${scalaMajorVersion}:${version.value}` - | - |### Maven Resolver - | `https://mmlspark.azureedge.net/maven` | |### Documentation Pages: |[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html) |[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html) | """.stripMargin + val buildInfo = (root / blobArtifactInfo).value + docInfo val infoFile = join("target", "Build.md") if (infoFile.exists()) FileUtils.forceDelete(infoFile) FileUtils.writeStringToFile(infoFile, buildInfo, "utf-8") } -val setupTask = TaskKey[Unit]("setup", "set up library for intellij") -setupTask := { - (Compile / compile).toTask.value - (Test / compile).toTask.value - getDatasetsTask.value +val rootGenDir = SettingKey[File]("rootGenDir") rootGenDir := { + val targetDir = artifactPath.in(packageBin).in(Compile).in(root).value.getParentFile + join(targetDir, "generated") } -val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob") -publishBlob := { - publishM2.value - val scalaVersionSuffix = scalaVersion.value.split(".".toCharArray.head).dropRight(1).mkString(".") - val nameAndScalaVersion = s"${name.value}_$scalaVersionSuffix" - - val localPackageFolder = join( - Seq(new File(new URI(Resolver.mavenLocal.root)).getAbsolutePath) - ++ organization.value.split(".".toCharArray.head) - ++ Seq(nameAndScalaVersion,
version.value): _*).toString +val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python") +generatePythonDoc := { + installPipPackage.all(ScopeFilter( + inProjects(core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile))).value + mergePyCode.all(ScopeFilter( + inProjects(core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile)) + ).value + val targetDir = artifactPath.in(packageBin).in(Compile).in(root).value.getParentFile + val codegenDir = join(targetDir, "generated") + val dir = join(codegenDir, "src", "python", "mmlspark") + runCmd(activateCondaEnv.value ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), dir) + runCmd(activateCondaEnv.value ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), dir) +} - val blobMavenFolder = organization.value.replace(".", "/") + - s"/$nameAndScalaVersion/${version.value}" - uploadToBlob(localPackageFolder, blobMavenFolder, "maven") +val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python") +publishDocs := { + //generatePythonDoc.value + (root / Compile / unidoc).value + val html = + """ + |
+ |<a href="pyspark/index.html">pyspark/</a> + |<a href="scala/index.html">scala/</a> + | + """.stripMargin + val targetDir = artifactPath.in(packageBin).in(Compile).in(root).value.getParentFile + val codegenDir = join(targetDir, "generated") + val unifiedDocDir = join(codegenDir, "doc") + val scalaDir = join(unifiedDocDir.toString, "scala") + if (scalaDir.exists()) FileUtils.forceDelete(scalaDir) + FileUtils.copyDirectory(join(targetDir, "unidoc"), scalaDir) + FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8") + uploadToBlob(unifiedDocDir.toString, version.value, "docs") } val release = TaskKey[Unit]("release", "publish the library to mmlspark blob") @@ -355,11 +178,8 @@ publishBadges := { } val settings = Seq( - (scalastyleConfig in Test) := baseDirectory.value / "scalastyle-test-config.xml", + (scalastyleConfig in Test) := (ThisBuild / baseDirectory).value / "scalastyle-test-config.xml", logBuffered in Test := false, - buildInfoKeys := Seq[BuildInfoKey]( - name, version, scalaVersion, sbtVersion, - baseDirectory, datasetDir, pythonizedVersion, rVersion), parallelExecution in Test := false, test in assembly := {}, assemblyMergeStrategy in assembly := { @@ -367,14 +187,90 @@ val settings = Seq( case x => MergeStrategy.first }, assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false), - buildInfoPackage := "com.microsoft.ml.spark.build") - -lazy val mmlspark = (project in file(".")) - .enablePlugins(BuildInfoPlugin) - .enablePlugins(ScalaUnidocPlugin) - .settings(settings: _*) + autoAPIMappings := true, + pomPostProcess := pomPostFunc, +) +ThisBuild / publishMavenStyle := true + +lazy val core = (project in file("core")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .settings((settings ++ Seq( + libraryDependencies ++= dependencies, + buildInfoKeys ++= Seq[BuildInfoKey]( + datasetDir, + version, + scalaVersion, + sbtVersion, + baseDirectory + ), + name := "mmlspark-core", + buildInfoPackage := "com.microsoft.ml.spark.build", + )): _*) + +lazy val deepLearning = (project in file("deep-learning")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.microsoft.cntk" % "cntk" % "2.4"), + name := "mmlspark-deep-learning", + )): _*) + +lazy val lightgbm = (project in file("lightgbm")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.2.110"), + name := "mmlspark-lightgbm" + )): _*) + +lazy val vw = (project in file("vw")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.github.vowpalwabbit" % "vw-jni" % "8.9.1"), + name := "mmlspark-vw" + )): _*) + +lazy val cognitive = (project in file("cognitive")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.14.0"), + resolvers += speechResolver, + name := "mmlspark-cognitive" + )): _*) + +lazy val opencv = (project in file("opencv")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("org.openpnp" % "opencv" % "3.2.0-1"), + name := "mmlspark-opencv" + )): _*) + +lazy val root = (project in file(".")) + .aggregate(core, deepLearning, cognitive, vw, lightgbm, opencv) + .dependsOn( + core % 
"test->test;compile->compile", + deepLearning % "test->test;compile->compile", + cognitive % "test->test;compile->compile", + vw % "test->test;compile->compile", + lightgbm % "test->test;compile->compile", + opencv % "test->test;compile->compile") + .enablePlugins(ScalaUnidocPlugin && SbtPlugin) + .disablePlugins(CodegenPlugin) + .settings(settings ++ Seq( + name := "mmlspark", + )) -import xerial.sbt.Sonatype._ +val setupTask = TaskKey[Unit]("setup", "set up library for intellij") +setupTask := { + compile.all(ScopeFilter( + inProjects(root, core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile, Test)) + ).value + getDatasetsTask.value +} sonatypeProjectHosting := Some( GitHubHosting("Azure", "MMLSpark", "mmlspark-support@microsot.com")) @@ -389,33 +285,30 @@ developers := List( ) licenses += ("MIT", url("https://github.com/Azure/mmlspark/blob/master/LICENSE")) -publishMavenStyle := true - -credentials += Credentials("Sonatype Nexus Repository Manager", - "oss.sonatype.org", - Secrets.nexusUsername, - Secrets.nexusPassword) - -pgpPassphrase := Some(Secrets.pgpPassword.toCharArray) -pgpSecretRing := { - val temp = File.createTempFile("secret", ".asc") - new PrintWriter(temp) { - write(Secrets.pgpPrivate); - close() - } - temp -} -pgpPublicRing := { - val temp = File.createTempFile("public", ".asc") - new PrintWriter(temp) { - write(Secrets.pgpPublic); - close() - } - temp -} +// +//credentials += Credentials("Sonatype Nexus Repository Manager", +// "oss.sonatype.org", +// Secrets.nexusUsername, +// Secrets.nexusPassword) +// +//pgpPassphrase := Some(Secrets.pgpPassword.toCharArray) +//pgpSecretRing := { +// val temp = File.createTempFile("secret", ".asc") +// new PrintWriter(temp) { +// write(Secrets.pgpPrivate); +// close() +// } +// temp +//} +//pgpPublicRing := { +// val temp = File.createTempFile("public", ".asc") +// new PrintWriter(temp) { +// write(Secrets.pgpPublic); +// close() +// } +// temp +//} +//publishTo := sonatypePublishToBundle.value dynverSonatypeSnapshots in ThisBuild := true dynverSeparator in ThisBuild := "-" -publishTo := sonatypePublishToBundle.value - -// Break Cache - 1 diff --git a/src/main/python/mmlspark/cognitive/AzureSearchWriter.py b/cognitive/src/main/python/mmlspark/cognitive/AzureSearchWriter.py similarity index 100% rename from src/main/python/mmlspark/cognitive/AzureSearchWriter.py rename to cognitive/src/main/python/mmlspark/cognitive/AzureSearchWriter.py diff --git a/src/main/python/mmlspark/cognitive/BingImageSearch.py b/cognitive/src/main/python/mmlspark/cognitive/BingImageSearch.py similarity index 100% rename from src/main/python/mmlspark/cognitive/BingImageSearch.py rename to cognitive/src/main/python/mmlspark/cognitive/BingImageSearch.py diff --git a/src/__init__.py b/cognitive/src/main/python/mmlspark/cognitive/__init__.py similarity index 100% rename from src/__init__.py rename to cognitive/src/main/python/mmlspark/cognitive/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala index 96024a68b63..b405bb13b09 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala @@ -143,7 +143,8 @@ object AzureSearchWriter extends IndexParser with SLogging { val Logger: Logger = LogManager.getRootLogger - private def checkForErrors(fatal: Boolean)(errorRow: Row, inputRow: Row): Option[Row] = { + private def checkForErrors( + fatal: Boolean)(errorRow: Row, inputRow: Row): Option[Row] = { Option(errorRow).map { r => val message = s"Service Exception:\n\t ${r.toString()} \n for input:\n\t ${inputRow.toString()}" if (fatal) { diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala similarity index 100% rename 
from src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala similarity index 96% rename from src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala index 3e4dc4e4a14..01de211a8e0 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala @@ -59,11 +59,11 @@ object RESTHelpers { response } else { val requestBodyOpt = Try(request match { - case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent) + case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent, "UTF-8") case _ => "" }).get - val responseBodyOpt = Try(IOUtils.toString(response.getEntity.getContent)).getOrElse("") + val responseBodyOpt = Try(IOUtils.toString(response.getEntity.getContent, "UTF-8")).getOrElse("") throw new RuntimeException( s"Failed: " + diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala similarity index 93% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala index 361c63507cf..b240da1a95f 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala @@ -32,7 +32,7 @@ object SpeechAPI { 
using(Client.execute(request)) { response => if (!response.getStatusLine.getStatusCode.toString.startsWith("2")) { val bodyOpt = request match { - case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent) + case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent, "UTF-8") case _ => "" } throw new RuntimeException( @@ -40,7 +40,7 @@ object SpeechAPI { s"requestUrl: ${request.getURI}" + s"requestBody: $bodyOpt") } - IOUtils.toString(response.getEntity.getContent) + IOUtils.toString(response.getEntity.getContent, "UTF-8") .parseJson.asJsObject().fields("Signature").compactPrint }.get }) diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala index 51a965b0d08..45447ac5f2d 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala @@ -8,15 +8,17 @@ import java.lang.ProcessBuilder.Redirect import java.net.{URI, URL} import java.util.UUID import java.util.concurrent.{LinkedBlockingQueue, TimeUnit} + import com.microsoft.cognitiveservices.speech._ import com.microsoft.cognitiveservices.speech.audio._ -import com.microsoft.cognitiveservices.speech.transcription.{Conversation, ConversationTranscriber, - ConversationTranscriptionEventArgs, Participant} +import com.microsoft.cognitiveservices.speech.transcription.{ + Conversation, ConversationTranscriber, ConversationTranscriptionEventArgs, Participant} import com.microsoft.cognitiveservices.speech.util.EventHandler import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.cognitive.SpeechFormat._ import com.microsoft.ml.spark.core.contracts.HasOutputCol import com.microsoft.ml.spark.core.schema.{DatasetExtensions, SparkBindings} +import com.microsoft.ml.spark.core.utils.OsUtils import com.microsoft.ml.spark.io.http.HasURL import com.microsoft.ml.spark.logging.BasicLogging import com.microsoft.ml.spark.{CompressedStream, WavStream} @@ -36,10 +38,6 @@ import spray.json._ import scala.concurrent.{ExecutionContext, Future, blocking} import scala.language.existentials -object OsUtils { - val IsWindows: Boolean = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0 -} - object SpeechToTextSDK extends ComplexParamsReadable[SpeechToTextSDK] private[ml] class BlockingQueueIterator[T](lbq: LinkedBlockingQueue[Option[T]], diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala 
similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala diff --git a/src/main/__init__.py b/cognitive/src/test/python/mmlsparktest/cognitive/__init__.py similarity index 100% rename from src/main/__init__.py rename to cognitive/src/test/python/mmlsparktest/cognitive/__init__.py diff --git a/src/test/python/mmlsparktest/cognitive/test_simple.py b/cognitive/src/test/python/mmlsparktest/cognitive/test_simple.py similarity index 100% rename from src/test/python/mmlsparktest/cognitive/test_simple.py rename to cognitive/src/test/python/mmlsparktest/cognitive/test_simple.py diff --git a/src/test/resources/audio1.txt b/cognitive/src/test/resources/audio1.txt similarity index 100% rename from src/test/resources/audio1.txt rename to cognitive/src/test/resources/audio1.txt diff --git a/src/test/resources/audio1.wav b/cognitive/src/test/resources/audio1.wav similarity index 100% rename from src/test/resources/audio1.wav rename to cognitive/src/test/resources/audio1.wav diff --git a/src/test/resources/audio2.txt b/cognitive/src/test/resources/audio2.txt similarity index 100% rename from src/test/resources/audio2.txt rename to cognitive/src/test/resources/audio2.txt diff --git a/src/test/resources/audio2.wav b/cognitive/src/test/resources/audio2.wav similarity index 100% rename from src/test/resources/audio2.wav rename to cognitive/src/test/resources/audio2.wav diff --git a/src/test/resources/audio3.mp3 b/cognitive/src/test/resources/audio3.mp3 similarity index 100% rename from src/test/resources/audio3.mp3 rename to cognitive/src/test/resources/audio3.mp3 diff --git a/src/test/resources/audio3.txt b/cognitive/src/test/resources/audio3.txt similarity index 100% rename from src/test/resources/audio3.txt rename to cognitive/src/test/resources/audio3.txt diff --git a/src/test/resources/audio4.txt b/cognitive/src/test/resources/audio4.txt similarity index 100% rename from src/test/resources/audio4.txt rename to cognitive/src/test/resources/audio4.txt diff --git a/src/test/resources/dialogue.mp3 b/cognitive/src/test/resources/dialogue.mp3 similarity index 100% rename from src/test/resources/dialogue.mp3 rename to cognitive/src/test/resources/dialogue.mp3 diff --git a/src/test/resources/lily.wav b/cognitive/src/test/resources/lily.wav similarity index 100% rename from src/test/resources/lily.wav rename to cognitive/src/test/resources/lily.wav diff --git a/src/test/resources/mark.wav b/cognitive/src/test/resources/mark.wav similarity index 100% rename from src/test/resources/mark.wav rename to cognitive/src/test/resources/mark.wav diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala 
b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala index 11a75834a4f..6255d9462b4 100644 --- a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala @@ -9,12 +9,10 @@ import com.microsoft.ml.spark.core.test.base.{Flaky, TestBase} import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.NamespaceInjections.pipelineModel import org.apache.spark.ml.util.MLReadable -import org.apache.spark.sql.functions.{corr, typedLit} +import org.apache.spark.sql.functions.typedLit import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.scalactic.Equality -import org.scalatest.Assertion import com.microsoft.ml.spark.FluentAPI._ -import com.microsoft.ml.spark.featurize.text.PageSplitter trait CognitiveKey { lazy val cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", Secrets.CognitiveApiKey) diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala similarity index 98% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala index 3cc8c4eefcf..3b1744c63f4 100644 --- a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala @@ -35,7 +35,7 @@ object FaceUtils extends CognitiveKey { using(Client.execute(request)) { response => if (!response.getStatusLine.getStatusCode.toString.startsWith("2")) { val bodyOpt = request match { - case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent) + case er: HttpEntityEnclosingRequestBase => IOUtils.toString(er.getEntity.getContent, "UTF-8") case _ => "" } throw new RuntimeException( @@ -43,7 +43,7 @@ object FaceUtils extends CognitiveKey { s"requestUrl: ${request.getURI}" + s"requestBody: $bodyOpt") } - IOUtils.toString(response.getEntity.getContent) + IOUtils.toString(response.getEntity.getContent, "UTF-8") }.get }) } diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala rename to 
cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala index 0f543420bd2..9b8d91af8ae 100644 --- a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala @@ -14,7 +14,8 @@ import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.http.client.methods.HttpDelete import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{lit, udf, col, split} +import org.apache.spark.sql.functions.{col, lit, split, udf} + import scala.collection.mutable import scala.concurrent.blocking diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala similarity index 94% rename from src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala index 620ab98aa28..d88d70d63af 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala @@ -1,11 +1,12 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. 
-package com.microsoft.ml.spark.core.utils +package com.microsoft.ml.spark.core.utils.utils import com.microsoft.ml.spark.cognitive.TextSentiment import com.microsoft.ml.spark.core.env.FileUtilities.join import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.utils.ModelEquality import com.microsoft.ml.spark.stages.DropColumns class ModelEqualitySuite extends TestBase { diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/SlicerFunctionsSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/SlicerFunctionsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/utils/SlicerFunctionsSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/SlicerFunctionsSuite.scala diff --git a/src/main/python/LICENSE.txt b/core/src/main/python/LICENSE.txt similarity index 100% rename from src/main/python/LICENSE.txt rename to core/src/main/python/LICENSE.txt diff --git a/src/main/python/MANIFEST.in b/core/src/main/python/MANIFEST.in similarity index 100% rename from src/main/python/MANIFEST.in rename to core/src/main/python/MANIFEST.in diff --git a/src/main/python/__init__.py b/core/src/main/python/__init__.py similarity index 100% rename from src/main/python/__init__.py rename to core/src/main/python/__init__.py diff --git a/src/main/python/mmlspark/README.txt b/core/src/main/python/mmlspark/README.txt similarity index 100% rename from src/main/python/mmlspark/README.txt rename to core/src/main/python/mmlspark/README.txt diff --git a/src/main/python/mmlspark/__init__.py b/core/src/main/python/mmlspark/__init__.py similarity index 100% rename from src/main/python/mmlspark/__init__.py rename to core/src/main/python/mmlspark/__init__.py diff --git a/src/main/python/mmlspark/automl/BestModel.py b/core/src/main/python/mmlspark/automl/BestModel.py similarity index 100% rename from src/main/python/mmlspark/automl/BestModel.py rename to core/src/main/python/mmlspark/automl/BestModel.py diff --git a/src/main/python/mmlspark/automl/HyperparamBuilder.py b/core/src/main/python/mmlspark/automl/HyperparamBuilder.py similarity index 100% rename from src/main/python/mmlspark/automl/HyperparamBuilder.py rename to core/src/main/python/mmlspark/automl/HyperparamBuilder.py diff --git a/src/main/python/mmlspark/automl/TuneHyperparametersModel.py b/core/src/main/python/mmlspark/automl/TuneHyperparametersModel.py similarity index 100% rename from src/main/python/mmlspark/automl/TuneHyperparametersModel.py rename to core/src/main/python/mmlspark/automl/TuneHyperparametersModel.py diff --git a/src/main/python/mmlspark/automl/__init__.py b/core/src/main/python/mmlspark/automl/__init__.py similarity index 100% rename from src/main/python/mmlspark/automl/__init__.py rename to core/src/main/python/mmlspark/automl/__init__.py diff --git a/src/main/python/mmlspark/cntk/__init__.py b/core/src/main/python/mmlspark/core/__init__.py similarity index 100% rename from src/main/python/mmlspark/cntk/__init__.py rename to core/src/main/python/mmlspark/core/__init__.py diff --git a/src/main/python/mmlspark/core/schema/TypeConversionUtils.py b/core/src/main/python/mmlspark/core/schema/TypeConversionUtils.py similarity index 100% rename from src/main/python/mmlspark/core/schema/TypeConversionUtils.py rename to core/src/main/python/mmlspark/core/schema/TypeConversionUtils.py diff --git a/src/main/python/mmlspark/core/schema/Utils.py b/core/src/main/python/mmlspark/core/schema/Utils.py similarity index 100% rename from 
src/main/python/mmlspark/core/schema/Utils.py rename to core/src/main/python/mmlspark/core/schema/Utils.py diff --git a/src/main/python/mmlspark/cognitive/__init__.py b/core/src/main/python/mmlspark/core/schema/__init__.py similarity index 100% rename from src/main/python/mmlspark/cognitive/__init__.py rename to core/src/main/python/mmlspark/core/schema/__init__.py diff --git a/src/main/python/mmlspark/core/__init__.py b/core/src/main/python/mmlspark/core/serialize/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/__init__.py rename to core/src/main/python/mmlspark/core/serialize/__init__.py diff --git a/src/main/python/mmlspark/core/serialize/java_params_patch.py b/core/src/main/python/mmlspark/core/serialize/java_params_patch.py similarity index 100% rename from src/main/python/mmlspark/core/serialize/java_params_patch.py rename to core/src/main/python/mmlspark/core/serialize/java_params_patch.py diff --git a/src/main/python/mmlspark/core/spark/FluentAPI.py b/core/src/main/python/mmlspark/core/spark/FluentAPI.py similarity index 100% rename from src/main/python/mmlspark/core/spark/FluentAPI.py rename to core/src/main/python/mmlspark/core/spark/FluentAPI.py diff --git a/src/main/python/mmlspark/core/schema/__init__.py b/core/src/main/python/mmlspark/core/spark/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/schema/__init__.py rename to core/src/main/python/mmlspark/core/spark/__init__.py diff --git a/src/main/python/mmlspark/core/serialize/__init__.py b/core/src/main/python/mmlspark/cyber/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/serialize/__init__.py rename to core/src/main/python/mmlspark/cyber/__init__.py diff --git a/src/main/python/mmlspark/core/spark/__init__.py b/core/src/main/python/mmlspark/cyber/anomaly/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/spark/__init__.py rename to core/src/main/python/mmlspark/cyber/anomaly/__init__.py diff --git a/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py b/core/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py rename to core/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py diff --git a/src/main/python/mmlspark/cyber/anomaly/complement_access.py b/core/src/main/python/mmlspark/cyber/anomaly/complement_access.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/complement_access.py rename to core/src/main/python/mmlspark/cyber/anomaly/complement_access.py diff --git a/src/main/python/mmlspark/cyber/dataset.py b/core/src/main/python/mmlspark/cyber/dataset.py similarity index 100% rename from src/main/python/mmlspark/cyber/dataset.py rename to core/src/main/python/mmlspark/cyber/dataset.py diff --git a/src/main/python/mmlspark/cyber/__init__.py b/core/src/main/python/mmlspark/cyber/feature/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/__init__.py rename to core/src/main/python/mmlspark/cyber/feature/__init__.py diff --git a/src/main/python/mmlspark/cyber/feature/indexers.py b/core/src/main/python/mmlspark/cyber/feature/indexers.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/indexers.py rename to core/src/main/python/mmlspark/cyber/feature/indexers.py diff --git a/src/main/python/mmlspark/cyber/feature/scalers.py b/core/src/main/python/mmlspark/cyber/feature/scalers.py similarity index 100% rename from 
src/main/python/mmlspark/cyber/feature/scalers.py
rename to core/src/main/python/mmlspark/cyber/feature/scalers.py
diff --git a/src/main/python/mmlspark/cyber/anomaly/__init__.py b/core/src/main/python/mmlspark/cyber/utils/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/cyber/anomaly/__init__.py
rename to core/src/main/python/mmlspark/cyber/utils/__init__.py
diff --git a/src/main/python/mmlspark/cyber/utils/spark_utils.py b/core/src/main/python/mmlspark/cyber/utils/spark_utils.py
similarity index 100%
rename from src/main/python/mmlspark/cyber/utils/spark_utils.py
rename to core/src/main/python/mmlspark/cyber/utils/spark_utils.py
diff --git a/src/main/python/mmlspark/doc/conf.py b/core/src/main/python/mmlspark/doc/conf.py
similarity index 100%
rename from src/main/python/mmlspark/doc/conf.py
rename to core/src/main/python/mmlspark/doc/conf.py
diff --git a/src/main/python/mmlspark/doc/index.rst b/core/src/main/python/mmlspark/doc/index.rst
similarity index 100%
rename from src/main/python/mmlspark/doc/index.rst
rename to core/src/main/python/mmlspark/doc/index.rst
diff --git a/src/main/python/mmlspark/doc/scala.rst b/core/src/main/python/mmlspark/doc/scala.rst
similarity index 100%
rename from src/main/python/mmlspark/doc/scala.rst
rename to core/src/main/python/mmlspark/doc/scala.rst
diff --git a/src/main/python/mmlspark/downloader/ModelDownloader.py b/core/src/main/python/mmlspark/downloader/ModelDownloader.py
similarity index 100%
rename from src/main/python/mmlspark/downloader/ModelDownloader.py
rename to core/src/main/python/mmlspark/downloader/ModelDownloader.py
diff --git a/src/main/python/mmlspark/cyber/feature/__init__.py b/core/src/main/python/mmlspark/downloader/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/cyber/feature/__init__.py
rename to core/src/main/python/mmlspark/downloader/__init__.py
diff --git a/src/main/python/mmlspark/io/IOImplicits.py b/core/src/main/python/mmlspark/io/IOImplicits.py
similarity index 100%
rename from src/main/python/mmlspark/io/IOImplicits.py
rename to core/src/main/python/mmlspark/io/IOImplicits.py
diff --git a/src/main/python/mmlspark/cyber/utils/__init__.py b/core/src/main/python/mmlspark/io/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/cyber/utils/__init__.py
rename to core/src/main/python/mmlspark/io/__init__.py
diff --git a/src/main/python/mmlspark/io/binary/BinaryFileReader.py b/core/src/main/python/mmlspark/io/binary/BinaryFileReader.py
similarity index 100%
rename from src/main/python/mmlspark/io/binary/BinaryFileReader.py
rename to core/src/main/python/mmlspark/io/binary/BinaryFileReader.py
diff --git a/src/main/python/mmlspark/downloader/__init__.py b/core/src/main/python/mmlspark/io/binary/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/downloader/__init__.py
rename to core/src/main/python/mmlspark/io/binary/__init__.py
diff --git a/src/main/python/mmlspark/io/http/HTTPFunctions.py b/core/src/main/python/mmlspark/io/http/HTTPFunctions.py
similarity index 100%
rename from src/main/python/mmlspark/io/http/HTTPFunctions.py
rename to core/src/main/python/mmlspark/io/http/HTTPFunctions.py
diff --git a/src/main/python/mmlspark/io/http/JSONOutputParser.py b/core/src/main/python/mmlspark/io/http/JSONOutputParser.py
similarity index 100%
rename from src/main/python/mmlspark/io/http/JSONOutputParser.py
rename to core/src/main/python/mmlspark/io/http/JSONOutputParser.py
diff --git a/src/main/python/mmlspark/io/http/ServingFunctions.py b/core/src/main/python/mmlspark/io/http/ServingFunctions.py
similarity index 100%
rename from src/main/python/mmlspark/io/http/ServingFunctions.py
rename to core/src/main/python/mmlspark/io/http/ServingFunctions.py
diff --git a/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py b/core/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py
similarity index 100%
rename from src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py
rename to core/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py
diff --git a/src/main/python/mmlspark/image/__init__.py b/core/src/main/python/mmlspark/io/http/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/image/__init__.py
rename to core/src/main/python/mmlspark/io/http/__init__.py
diff --git a/src/main/python/mmlspark/io/image/ImageUtils.py b/core/src/main/python/mmlspark/io/image/ImageUtils.py
similarity index 100%
rename from src/main/python/mmlspark/io/image/ImageUtils.py
rename to core/src/main/python/mmlspark/io/image/ImageUtils.py
diff --git a/src/main/python/mmlspark/io/__init__.py b/core/src/main/python/mmlspark/io/image/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/io/__init__.py
rename to core/src/main/python/mmlspark/io/image/__init__.py
diff --git a/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py b/core/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py
similarity index 100%
rename from src/main/python/mmlspark/io/powerbi/PowerBIWriter.py
rename to core/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py
diff --git a/src/main/python/mmlspark/io/binary/__init__.py b/core/src/main/python/mmlspark/io/powerbi/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/io/binary/__init__.py
rename to core/src/main/python/mmlspark/io/powerbi/__init__.py
diff --git a/src/main/python/mmlspark/nn/ConditionalBallTree.py b/core/src/main/python/mmlspark/nn/ConditionalBallTree.py
similarity index 100%
rename from src/main/python/mmlspark/nn/ConditionalBallTree.py
rename to core/src/main/python/mmlspark/nn/ConditionalBallTree.py
diff --git a/src/main/python/mmlspark/io/http/__init__.py b/core/src/main/python/mmlspark/nn/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/io/http/__init__.py
rename to core/src/main/python/mmlspark/nn/__init__.py
diff --git a/src/main/python/mmlspark/io/image/__init__.py b/core/src/main/python/mmlspark/plot/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/io/image/__init__.py
rename to core/src/main/python/mmlspark/plot/__init__.py
diff --git a/src/main/python/mmlspark/plot/plot.py b/core/src/main/python/mmlspark/plot/plot.py
similarity index 100%
rename from src/main/python/mmlspark/plot/plot.py
rename to core/src/main/python/mmlspark/plot/plot.py
diff --git a/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py b/core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py
similarity index 100%
rename from src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py
rename to core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py
diff --git a/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py b/core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py
similarity index 100%
rename from src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py
rename to core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py
diff --git a/src/main/python/mmlspark/recommendation/SARModel.py b/core/src/main/python/mmlspark/recommendation/SARModel.py
similarity index 100%
rename from src/main/python/mmlspark/recommendation/SARModel.py
rename to core/src/main/python/mmlspark/recommendation/SARModel.py
diff --git a/src/main/python/mmlspark/recommendation/__init__.py b/core/src/main/python/mmlspark/recommendation/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/recommendation/__init__.py
rename to core/src/main/python/mmlspark/recommendation/__init__.py
diff --git a/src/main/python/mmlspark/stages/UDFTransformer.py b/core/src/main/python/mmlspark/stages/UDFTransformer.py
similarity index 100%
rename from src/main/python/mmlspark/stages/UDFTransformer.py
rename to core/src/main/python/mmlspark/stages/UDFTransformer.py
diff --git a/src/main/python/mmlspark/io/powerbi/__init__.py b/core/src/main/python/mmlspark/stages/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/io/powerbi/__init__.py
rename to core/src/main/python/mmlspark/stages/__init__.py
diff --git a/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala
rename to core/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala
rename to core/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala
rename to core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt b/core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt
rename to core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala
rename to core/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala
rename to core/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala
rename to core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt b/core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt
rename to core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt
diff --git a/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala
new file mode 100644
index 00000000000..bd88735e5f0
--- /dev/null
+++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala
@@ -0,0 +1,202 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.codegen
+
+import java.io.File
+
+import com.microsoft.ml.spark.codegen.CodegenConfigProtocol._
+import com.microsoft.ml.spark.core.env.FileUtilities._
+import org.apache.commons.io.FileUtils
+import org.apache.commons.io.FilenameUtils._
+import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices
+import spray.json._
+
+object CodeGenUtils {
+  def clean(dir: File): Unit = if (dir.exists()) FileUtils.forceDelete(dir)
+
+  def toDir(f: File): File = new File(f, File.separator)
+}
+
+
+object CodeGen {
+
+  import CodeGenUtils._
+
+  def generatePythonClasses(conf: CodegenConfig): Unit = {
+    val instantiatedClasses = instantiateServices[PythonWrappable](conf.jarName)
+    instantiatedClasses.foreach { w =>
+      println(w.getClass.getName)
+      w.makePyFile(conf)
+    }
+  }
+
+  def generateRClasses(conf: CodegenConfig): Unit = {
+    val instantiatedClasses = instantiateServices[RWrappable](conf.jarName)
+    instantiatedClasses.foreach { w =>
+      println(w.getClass.getName)
+      w.makeRFile(conf)
+    }
+  }
+
+  private def makeInitFiles(conf: CodegenConfig, packageFolder: String = ""): Unit = {
+    val dir = new File(new File(conf.pySrcDir, "mmlspark"), packageFolder)
+    val packageString = if (packageFolder != "") packageFolder.replace("/", ".") else ""
+    val importStrings =
+      dir.listFiles.filter(_.isFile).sorted
+        .map(_.getName)
+        .filter(name => name.endsWith(".py") && !name.startsWith("_") && !name.startsWith("test"))
+        .map(name => s"from mmlspark$packageString.${getBaseName(name)} import *\n").mkString("")
+    val initFile = new File(dir, "__init__.py")
+    if (packageFolder != "") {
+      writeFile(initFile, conf.packageHelp(importStrings))
+    } else if (initFile.exists()) {
+      initFile.delete()
+    }
+    dir.listFiles().filter(_.isDirectory).foreach(f =>
+      makeInitFiles(conf, packageFolder + "/" + f.getName)
+    )
+  }
+
+  //noinspection ScalaStyle
+  def generateRPackageData(conf: CodegenConfig): Unit = {
+    // description file; need to encode version as decimal
+    val today = new java.text.SimpleDateFormat("yyyy-MM-dd")
+      .format(new java.util.Date())
+
+    conf.rSrcDir.mkdirs()
+    writeFile(new File(conf.rSrcDir.getParentFile, "DESCRIPTION"),
+      s"""|Package: ${conf.name.replace("-",".")}
+          |Title: Access to MMLSpark via R
+          |Description: Provides an interface to MMLSpark.
+          |Version: ${conf.rVersion}
+          |Date: $today
+          |Author: Microsoft Corporation
+          |Maintainer: MMLSpark Team
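
Note on makeInitFiles above: it star-imports every generated module into the package's __init__.py. A minimal Scala sketch of the string it builds, using a package and module name taken from the renames above for illustration:

    // assuming packageFolder "/io/http" and a module file HTTPFunctions.py
    val packageFolder = "/io/http"
    val packageString = if (packageFolder != "") packageFolder.replace("/", ".") else ""
    val importLine = s"from mmlspark$packageString.HTTPFunctions import *"
    // importLine == "from mmlspark.io.http.HTTPFunctions import *"
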
-/** UnicodeNormalize takes a dataframe and normalizes the unicode representation.
- */
-class UnicodeNormalize(val uid: String) extends Transformer
- with HasInputCol with HasOutputCol with Wrappable with ComplexParamsWritable with BasicLogging {
- logClass()
-
- def this() = this(Identifiable.randomUID("UnicodeNormalize"))
-
- val form = new Param[String](this, "form", "Unicode normalization form: NFC, NFD, NFKC, NFKD")
-
- /** @group getParam */
- def getForm: String = get(form).getOrElse("NFKD")
-
- /** @group setParam */
- def setForm(value: String): this.type = {
- // check input value
- Normalizer.Form.valueOf(getForm)
-
- set("form", value)
- }
-
- val lower = new BooleanParam(this, "lower", "Lowercase text")
-
- /** @group getParam */
- def getLower: Boolean = get(lower).getOrElse(true)
-
- /** @group setParam */
- def setLower(value: Boolean): this.type = set("lower", value)
-
- /** @param dataset - The input dataset, to be transformed
- * @return The DataFrame that results from column selection
- */
- override def transform(dataset: Dataset[_]): DataFrame = {
- logTransform[DataFrame]({
- val inputIndex = dataset.columns.indexOf(getInputCol)
-
- require(inputIndex != -1, s"Input column $getInputCol does not exist")
-
- val normalizeFunc = (value: String) =>
- if (value == null) null
- else Normalizer.normalize(value, Normalizer.Form.valueOf(getForm))
-
- val f = if (getLower)
- (value: String) => Option(value).map(s => normalizeFunc(s.toLowerCase)).orNull
- else
- normalizeFunc
-
- val textMapper = udf(f)
-
- dataset.withColumn(getOutputCol, textMapper(dataset(getInputCol)).as(getOutputCol))
- })
- }
-
- def transformSchema(schema: StructType): StructType = {
- schema.add(StructField(getOutputCol, StringType))
- }
-
- def copy(extra: ParamMap): UnicodeNormalize = defaultCopy(extra)
-
-}
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.stages
+
+import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
+import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap}
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.{DataFrame, Dataset}
+import org.apache.spark.sql.functions.udf
+
+import java.text.Normalizer
+import com.microsoft.ml.spark.codegen.Wrappable
+import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol}
+import com.microsoft.ml.spark.logging.BasicLogging
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+
+object UnicodeNormalize extends ComplexParamsReadable[UnicodeNormalize]
+
+/** UnicodeNormalize takes a dataframe and normalizes the unicode representation.
+ */
+class UnicodeNormalize(val uid: String) extends Transformer
+ with HasInputCol with HasOutputCol with Wrappable with ComplexParamsWritable with BasicLogging {
+ logClass()
+
+ def this() = this(Identifiable.randomUID("UnicodeNormalize"))
+
+ val form = new Param[String](this, "form", "Unicode normalization form: NFC, NFD, NFKC, NFKD")
+
+ /** @group getParam */
+ def getForm: String = get(form).getOrElse("NFKD")
+
+ /** @group setParam */
+ def setForm(value: String): this.type = {
+ // validate the incoming value (valueOf throws IllegalArgumentException on bad input)
+ Normalizer.Form.valueOf(value)
+
+ set("form", value)
+ }
+
+ val lower = new BooleanParam(this, "lower", "Lowercase text")
+
+ /** @group getParam */
+ def getLower: Boolean = get(lower).getOrElse(true)
+
+ /** @group setParam */
+ def setLower(value: Boolean): this.type = set("lower", value)
+
+ /** @param dataset The input dataset to be transformed
+ * @return The input DataFrame with the normalized text appended as the output column
+ */
+ override def transform(dataset: Dataset[_]): DataFrame = {
+ logTransform[DataFrame]({
+ val inputIndex = dataset.columns.indexOf(getInputCol)
+
+ require(inputIndex != -1, s"Input column $getInputCol does not exist")
+
+ val normalizeFunc = (value: String) =>
+ if (value == null) null
+ else Normalizer.normalize(value, Normalizer.Form.valueOf(getForm))
+
+ val f = if (getLower)
+ (value: String) => Option(value).map(s => normalizeFunc(s.toLowerCase)).orNull
+ else
+ normalizeFunc
+
+ val textMapper = udf(f)
+
+ dataset.withColumn(getOutputCol, textMapper(dataset(getInputCol)).as(getOutputCol))
+ })
+ }
+
+ def transformSchema(schema: StructType): StructType = {
+ schema.add(StructField(getOutputCol, StringType))
+ }
+
+ def copy(extra: ParamMap): UnicodeNormalize = defaultCopy(extra)
+
+}
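
UnicodeNormalize above is an ordinary Spark ML Transformer, so it composes with any pipeline. A minimal usage sketch, assuming an active SparkSession named spark; with the defaults shown in the class, form is NFKD and lower is true:

    import com.microsoft.ml.spark.stages.UnicodeNormalize
    import spark.implicits._

    val df = Seq("Ba\u0308r", "CAFE\u0301").toDF("text")  // combining-mark inputs
    val normalized = new UnicodeNormalize()
      .setInputCol("text")
      .setOutputCol("textNorm")
      .setForm("NFC")      // override the NFKD default
      .transform(df)       // appends "textNorm"; null values pass through unchanged
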
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/udfs.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala b/core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala b/core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala b/core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt b/core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt
rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala b/core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt b/core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt
rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala b/core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt b/core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt
rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala b/core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt b/core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt
rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt
diff --git a/src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala
similarity index 92%
rename from src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala
rename to core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala
index ee0ba74dd41..6d0564abb4b 100644
--- a/src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala
+++ b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala
@@ -1,13 +1,14 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.
-package org.apache.spark.lightgbm
+package org.apache.spark.injections
import org.apache.spark.sql.Dataset
import org.apache.spark.storage.BlockManager
object BlockManagerUtils {
/** Returns the block manager from the dataframe's spark context.
+ *
* @param data The dataframe to get the block manager from.
* @return The block manager.
*/
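
The move from org.apache.spark.lightgbm to org.apache.spark.injections groups together the helpers that must live inside Spark's own namespace: compiling into an org.apache.spark.* package is what grants access to private[spark] members. A hypothetical sketch of the pattern (InjectionSketch is not part of this patch):

    package org.apache.spark.injections

    import org.apache.spark.sql.Dataset
    import org.apache.spark.storage.BlockManager

    object InjectionSketch {
      // SparkContext.env is private[spark]; it is visible here only
      // because this file is compiled under org.apache.spark
      def blockManagerOf(data: Dataset[_]): BlockManager =
        data.sparkSession.sparkContext.env.blockManager
    }
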
diff --git a/src/main/scala/org/apache/spark/injections/RegressionUtils.scala b/core/src/main/scala/org/apache/spark/injections/RegressionUtils.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/injections/RegressionUtils.scala
rename to core/src/main/scala/org/apache/spark/injections/RegressionUtils.scala
diff --git a/src/main/scala/org/apache/spark/injections/SConf.scala b/core/src/main/scala/org/apache/spark/injections/SConf.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/injections/SConf.scala
rename to core/src/main/scala/org/apache/spark/injections/SConf.scala
diff --git a/src/main/scala/org/apache/spark/injections/UDFUtils.scala b/core/src/main/scala/org/apache/spark/injections/UDFUtils.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/injections/UDFUtils.scala
rename to core/src/main/scala/org/apache/spark/injections/UDFUtils.scala
diff --git a/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala b/core/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala
rename to core/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala
diff --git a/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala b/core/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala
rename to core/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala
diff --git a/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala b/core/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/NamespaceInjections.scala
rename to core/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala
diff --git a/src/main/scala/org/apache/spark/ml/Ranker.scala b/core/src/main/scala/org/apache/spark/ml/Ranker.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/Ranker.scala
rename to core/src/main/scala/org/apache/spark/ml/Ranker.scala
diff --git a/src/main/scala/org/apache/spark/ml/RegressorUtils.scala b/core/src/main/scala/org/apache/spark/ml/RegressorUtils.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/RegressorUtils.scala
rename to core/src/main/scala/org/apache/spark/ml/RegressorUtils.scala
diff --git a/src/main/scala/org/apache/spark/ml/Serializer.scala b/core/src/main/scala/org/apache/spark/ml/Serializer.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/Serializer.scala
rename to core/src/main/scala/org/apache/spark/ml/Serializer.scala
diff --git a/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala b/core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala
rename to core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala
diff --git a/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt b/core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt
rename to core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt
diff --git a/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala b/core/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala b/core/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala b/core/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/MapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/MapParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/MapParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/MapParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala b/core/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/TransformerParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/UDFParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UDFParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/UDFParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/UDFParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala b/core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala
rename to core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala
diff --git a/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala b/core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala
rename to core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala
diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala
rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala
diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala
rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala
diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala
rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala
diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala
rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala
diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala
similarity index 99%
rename from src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala
rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala
index ef61d8330df..05b373a5f2f 100644
--- a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala
+++ b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala
@@ -152,7 +152,7 @@ private[streaming] object DriverServiceUtils {
override def handle(request: HttpExchange): Unit = {
try {
val info = Serialization.read[ServiceInfo](
- IOUtils.toString(request.getRequestBody))
+ IOUtils.toString(request.getRequestBody, "UTF-8"))
HTTPServerUtils.respond(request, HTTPResponseData(
Array(), None,
StatusLineData(null, 200, "Success"),
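
The added "UTF-8" argument matters because the one-argument IOUtils.toString overload decodes with the JVM's platform-default charset (and is deprecated in commons-io). A small sketch with an illustrative payload:

    import java.io.ByteArrayInputStream
    import org.apache.commons.io.IOUtils

    val payload = """{"host": "localhost"}"""  // illustrative ServiceInfo JSON
    val stream = new ByteArrayInputStream(payload.getBytes("UTF-8"))
    val text = IOUtils.toString(stream, "UTF-8")  // same result on every JVM locale
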
diff --git a/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala b/core/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala
rename to core/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala
diff --git a/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala b/core/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala
rename to core/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala
diff --git a/src/test/R/testthat.R b/core/src/test/R/testthat.R
similarity index 100%
rename from src/test/R/testthat.R
rename to core/src/test/R/testthat.R
diff --git a/src/test/R/testthat/setup-spark.R b/core/src/test/R/testthat/setup-spark.R
similarity index 100%
rename from src/test/R/testthat/setup-spark.R
rename to core/src/test/R/testthat/setup-spark.R
diff --git a/src/test/R/testthat/test-basic.R b/core/src/test/R/testthat/test-basic.R
similarity index 100%
rename from src/test/R/testthat/test-basic.R
rename to core/src/test/R/testthat/test-basic.R
diff --git a/src/test/python/LICENSE.txt b/core/src/test/python/LICENSE.txt
similarity index 100%
rename from src/test/python/LICENSE.txt
rename to core/src/test/python/LICENSE.txt
diff --git a/src/test/python/MANIFEST.in b/core/src/test/python/MANIFEST.in
similarity index 100%
rename from src/test/python/MANIFEST.in
rename to core/src/test/python/MANIFEST.in
diff --git a/src/main/python/mmlspark/lightgbm/__init__.py b/core/src/test/python/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/lightgbm/__init__.py
rename to core/src/test/python/__init__.py
diff --git a/src/main/python/mmlspark/nn/__init__.py b/core/src/test/python/mmlsparktest/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/nn/__init__.py
rename to core/src/test/python/mmlsparktest/__init__.py
diff --git a/src/main/python/mmlspark/opencv/__init__.py b/core/src/test/python/mmlsparktest/cyber/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/opencv/__init__.py
rename to core/src/test/python/mmlsparktest/cyber/__init__.py
diff --git a/src/main/python/mmlspark/plot/__init__.py b/core/src/test/python/mmlsparktest/cyber/anamoly/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/plot/__init__.py
rename to core/src/test/python/mmlsparktest/cyber/anamoly/__init__.py
diff --git a/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py b/core/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py
rename to core/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py
diff --git a/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py b/core/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py
rename to core/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py
diff --git a/src/test/python/mmlsparktest/cyber/explain_tester.py b/core/src/test/python/mmlsparktest/cyber/explain_tester.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/explain_tester.py
rename to core/src/test/python/mmlsparktest/cyber/explain_tester.py
diff --git a/src/main/python/mmlspark/stages/__init__.py b/core/src/test/python/mmlsparktest/cyber/feature/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/stages/__init__.py
rename to core/src/test/python/mmlsparktest/cyber/feature/__init__.py
diff --git a/src/test/python/mmlsparktest/cyber/feature/test_indexers.py b/core/src/test/python/mmlsparktest/cyber/feature/test_indexers.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/feature/test_indexers.py
rename to core/src/test/python/mmlsparktest/cyber/feature/test_indexers.py
diff --git a/src/test/python/mmlsparktest/cyber/feature/test_scalers.py b/core/src/test/python/mmlsparktest/cyber/feature/test_scalers.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/feature/test_scalers.py
rename to core/src/test/python/mmlsparktest/cyber/feature/test_scalers.py
diff --git a/src/main/python/mmlspark/vw/__init__.py b/core/src/test/python/mmlsparktest/cyber/utils/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/vw/__init__.py
rename to core/src/test/python/mmlsparktest/cyber/utils/__init__.py
diff --git a/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py b/core/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py
rename to core/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py
diff --git a/src/test/__init__.py b/core/src/test/python/mmlsparktest/nn/__init__.py
similarity index 100%
rename from src/test/__init__.py
rename to core/src/test/python/mmlsparktest/nn/__init__.py
diff --git a/src/test/python/mmlsparktest/nn/test_ball_tree.py b/core/src/test/python/mmlsparktest/nn/test_ball_tree.py
similarity index 100%
rename from src/test/python/mmlsparktest/nn/test_ball_tree.py
rename to core/src/test/python/mmlsparktest/nn/test_ball_tree.py
diff --git a/src/test/python/__init__.py b/core/src/test/python/mmlsparktest/recommendation/__init__.py
similarity index 100%
rename from src/test/python/__init__.py
rename to core/src/test/python/mmlsparktest/recommendation/__init__.py
diff --git a/src/test/python/mmlsparktest/recommendation/test_ranking.py b/core/src/test/python/mmlsparktest/recommendation/test_ranking.py
similarity index 100%
rename from src/test/python/mmlsparktest/recommendation/test_ranking.py
rename to core/src/test/python/mmlsparktest/recommendation/test_ranking.py
diff --git a/src/test/python/setup.py b/core/src/test/python/setup.py
similarity index 100%
rename from src/test/python/setup.py
rename to core/src/test/python/setup.py
diff --git a/core/src/test/resources/audio1.txt b/core/src/test/resources/audio1.txt
new file mode 100644
index 00000000000..de9993a6af0
--- /dev/null
+++ b/core/src/test/resources/audio1.txt
@@ -0,0 +1 @@
+Content like data models tests and end points are organized into projects in the custom speech portal. Each project is specific to a domain and country slash language. For example, you may create a project for call centers that use English in the United States to create your first project select the speech to text slash custom speech, then click new project follow the instructions provided by The Wizard to create your project after you've created a project you should see 4 tabs data testing training. And deployment use the links provided in Next steps to learn how to use each tab.
\ No newline at end of file
diff --git a/src/test/resources/benchmarks/benchmarkBasicDataTypes.json b/core/src/test/resources/benchmarks/benchmarkBasicDataTypes.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkBasicDataTypes.json
rename to core/src/test/resources/benchmarks/benchmarkBasicDataTypes.json
diff --git a/src/test/resources/benchmarks/benchmarkDate.json b/core/src/test/resources/benchmarks/benchmarkDate.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkDate.json
rename to core/src/test/resources/benchmarks/benchmarkDate.json
diff --git a/src/test/resources/benchmarks/benchmarkNoOneHot.json b/core/src/test/resources/benchmarks/benchmarkNoOneHot.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkNoOneHot.json
rename to core/src/test/resources/benchmarks/benchmarkNoOneHot.json
diff --git a/src/test/resources/benchmarks/benchmarkOneHot.json b/core/src/test/resources/benchmarks/benchmarkOneHot.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkOneHot.json
rename to core/src/test/resources/benchmarks/benchmarkOneHot.json
diff --git a/src/test/resources/benchmarks/benchmarkString.json b/core/src/test/resources/benchmarks/benchmarkString.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkString.json
rename to core/src/test/resources/benchmarks/benchmarkString.json
diff --git a/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json b/core/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkStringIndexOneHot.json
rename to core/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json
diff --git a/src/test/resources/benchmarks/benchmarkStringMissing.json b/core/src/test/resources/benchmarks/benchmarkStringMissing.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkStringMissing.json
rename to core/src/test/resources/benchmarks/benchmarkStringMissing.json
diff --git a/src/test/resources/benchmarks/benchmarkVectors.json b/core/src/test/resources/benchmarks/benchmarkVectors.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkVectors.json
rename to core/src/test/resources/benchmarks/benchmarkVectors.json
diff --git a/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv
similarity index 100%
rename from src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv
rename to core/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv
diff --git a/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv
similarity index 100%
rename from src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv
rename to core/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv
diff --git a/src/test/resources/demoUsage.csv.gz b/core/src/test/resources/demoUsage.csv.gz
similarity index 100%
rename from src/test/resources/demoUsage.csv.gz
rename to core/src/test/resources/demoUsage.csv.gz
diff --git a/src/test/resources/greyhound.jpg b/core/src/test/resources/greyhound.jpg
similarity index 100%
rename from src/test/resources/greyhound.jpg
rename to core/src/test/resources/greyhound.jpg
diff --git a/src/test/resources/sim_count1.csv.gz b/core/src/test/resources/sim_count1.csv.gz
similarity index 100%
rename from src/test/resources/sim_count1.csv.gz
rename to core/src/test/resources/sim_count1.csv.gz
diff --git a/src/test/resources/sim_count3.csv.gz b/core/src/test/resources/sim_count3.csv.gz
similarity index 100%
rename from src/test/resources/sim_count3.csv.gz
rename to core/src/test/resources/sim_count3.csv.gz
diff --git a/src/test/resources/sim_jac1.csv.gz b/core/src/test/resources/sim_jac1.csv.gz
similarity index 100%
rename from src/test/resources/sim_jac1.csv.gz
rename to core/src/test/resources/sim_jac1.csv.gz
diff --git a/src/test/resources/sim_jac3.csv.gz b/core/src/test/resources/sim_jac3.csv.gz
similarity index 100%
rename from src/test/resources/sim_jac3.csv.gz
rename to core/src/test/resources/sim_jac3.csv.gz
diff --git a/src/test/resources/sim_lift1.csv.gz b/core/src/test/resources/sim_lift1.csv.gz
similarity index 100%
rename from src/test/resources/sim_lift1.csv.gz
rename to core/src/test/resources/sim_lift1.csv.gz
diff --git a/src/test/resources/sim_lift3.csv.gz b/core/src/test/resources/sim_lift3.csv.gz
similarity index 100%
rename from src/test/resources/sim_lift3.csv.gz
rename to core/src/test/resources/sim_lift3.csv.gz
diff --git a/src/test/resources/user_aff.csv.gz b/core/src/test/resources/user_aff.csv.gz
similarity index 100%
rename from src/test/resources/user_aff.csv.gz
rename to core/src/test/resources/user_aff.csv.gz
diff --git a/src/test/resources/userpred_count3_userid_only.csv.gz b/core/src/test/resources/userpred_count3_userid_only.csv.gz
similarity index 100%
rename from src/test/resources/userpred_count3_userid_only.csv.gz
rename to core/src/test/resources/userpred_count3_userid_only.csv.gz
diff --git a/src/test/resources/userpred_jac3_userid_only.csv.gz b/core/src/test/resources/userpred_jac3_userid_only.csv.gz
similarity index 100%
rename from src/test/resources/userpred_jac3_userid_only.csv.gz
rename to core/src/test/resources/userpred_jac3_userid_only.csv.gz
diff --git a/src/test/resources/userpred_lift3_userid_only.csv.gz b/core/src/test/resources/userpred_lift3_userid_only.csv.gz
similarity index 100%
rename from src/test/resources/userpred_lift3_userid_only.csv.gz
rename to core/src/test/resources/userpred_lift3_userid_only.csv.gz
diff --git a/src/test/scala/com/microsoft/ml/spark/Secrets.scala b/core/src/test/scala/com/microsoft/ml/spark/Secrets.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/Secrets.scala
rename to core/src/test/scala/com/microsoft/ml/spark/Secrets.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala b/core/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala
rename to core/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala b/core/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala
rename to core/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala
diff --git a/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala b/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala
new file mode 100644
index 00000000000..c0a5b315014
--- /dev/null
+++ b/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala
@@ -0,0 +1,87 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.codegen
+
+import java.io.File
+
+import com.microsoft.ml.spark.codegen.CodegenConfigProtocol._
+import com.microsoft.ml.spark.core.env.FileUtilities._
+import com.microsoft.ml.spark.core.test.base.TestBase
+import com.microsoft.ml.spark.core.test.fuzzing.PyTestFuzzing
+import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices
+import org.apache.commons.io.FileUtils
+import spray.json._
+
+
+object TestGen {
+
+ import CodeGenUtils._
+
+ def generatePythonTests(conf: CodegenConfig): Unit = {
+ instantiateServices[PyTestFuzzing[_]](conf.jarName).foreach { ltc =>
+ try {
+ ltc.makePyTestFile(conf)
+ } catch {
+ case _: NotImplementedError =>
+ println(s"ERROR: Could not generate test for ${ltc.testClassName} because of Complex Parameters")
+ }
+ }
+ }
+
+ private def makeInitFiles(conf: CodegenConfig, packageFolder: String = ""): Unit = {
+ val dir = new File(new File(conf.pyTestDir, "mmlsparktest"), packageFolder)
+ if (!dir.exists()){
+ dir.mkdirs()
+ }
+ writeFile(new File(dir, "__init__.py"), "")
+ dir.listFiles().filter(_.isDirectory).foreach(f =>
+ makeInitFiles(conf, packageFolder + "/" + f.getName)
+ )
+ }
+
+
+ //noinspection ScalaStyle
+ def generatePyPackageData(conf: CodegenConfig): Unit = {
+ if (!conf.pySrcDir.exists()) {
+ conf.pySrcDir.mkdir()
+ }
+ writeFile(join(conf.pyTestDir,"mmlsparktest", "spark.py"),
+ s"""
+ |# Copyright (C) Microsoft Corporation. All rights reserved.
+ |# Licensed under the MIT License. See LICENSE in project root for information.
+ |
+ |from pyspark.sql import SparkSession, SQLContext
+ |import os
+ |import mmlspark
+ |from mmlspark.core import __spark_package_version__
+ |
+ |spark = (SparkSession.builder
+ | .master("local[*]")
+ | .appName("PysparkTests")
+ | .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark:" + __spark_package_version__)
+ | .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
+ | .config("spark.executor.heartbeatInterval", "60s")
+ | .config("spark.sql.shuffle.partitions", 10)
+ | .config("spark.sql.crossJoin.enabled", "true")
+ | .getOrCreate())
+ |
+ |sc = SQLContext(spark.sparkContext)
+ |
+ |""".stripMargin)
+ }
+
+
+ def main(args: Array[String]): Unit = {
+ val conf = args.head.parseJson.convertTo[CodegenConfig]
+ clean(conf.testDataDir)
+ clean(conf.pyTestDir)
+ generatePythonTests(conf)
+ generatePyPackageData(conf)
+ //TestBase.stopSparkSession()
+ if (toDir(conf.pyTestOverrideDir).exists()){
+ FileUtils.copyDirectoryToDirectory(toDir(conf.pyTestOverrideDir), toDir(conf.pyTestDir))
+ }
+ makeInitFiles(conf)
+ }
+}
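
generatePythonTests above deliberately continues past stages whose test wrappers cannot be generated, logging instead of aborting the whole run. The control-flow pattern in isolation (a sketch; generateAll is hypothetical):

    def generateAll[A](items: Seq[A])(gen: A => Unit): Unit =
      items.foreach { item =>
        try gen(item)
        catch { case _: NotImplementedError => println(s"ERROR: skipped $item") }
      }
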
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala b/core/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
similarity index 79%
rename from src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
index faaf19398ea..031d1b333e4 100644
--- a/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
@@ -31,23 +31,27 @@ object SparkSessionFactory {
if (File.separator != "\\") path
else path.replaceFirst("[A-Z]:", "").replace("\\", "/")
}
+
def currentDir(): String = System.getProperty("user.dir")
def getSession(name: String, logLevel: String = "WARN",
numRetries: Int = 1, numCores: Option[Int] = None): SparkSession = {
val cores = numCores.map(_.toString).getOrElse("*")
val conf = new SparkConf()
- .setAppName(name)
- .setMaster(if (numRetries == 1){s"local[$cores]"}else{s"local[$cores, $numRetries]"})
- .set("spark.logConf", "true")
- .set("spark.sql.shuffle.partitions", "20")
- .set("spark.driver.maxResultSize", "6g")
- .set("spark.sql.warehouse.dir", SparkSessionFactory.LocalWarehousePath)
- .set("spark.sql.crossJoin.enabled", "true")
+ .setAppName(name)
+ .setMaster(if (numRetries == 1) {
+ s"local[$cores]"
+ } else {
+ s"local[$cores, $numRetries]"
+ })
+ .set("spark.logConf", "true")
+ .set("spark.sql.shuffle.partitions", "20")
+ .set("spark.driver.maxResultSize", "6g")
+ .set("spark.sql.warehouse.dir", SparkSessionFactory.LocalWarehousePath)
+ .set("spark.sql.crossJoin.enabled", "true")
val sess = SparkSession.builder()
.config(conf)
.getOrCreate()
- sess.sparkContext.setLogLevel(logLevel)
sess
}
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
similarity index 99%
rename from src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
index 097c120581b..84a2bfcd08f 100644
--- a/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
@@ -3,6 +3,8 @@
package com.microsoft.ml.spark.core.test.base
+import java.nio.file.Files
+
import breeze.linalg.norm.Impl
import breeze.linalg.{norm, DenseVector => BDV}
import breeze.math.Field
@@ -17,7 +19,6 @@ import org.scalatest._
import org.scalatest.concurrent.TimeLimits
import org.scalatest.time.{Seconds, Span}
-import java.nio.file.Files
import scala.concurrent._
import scala.reflect.ClassTag
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
similarity index 90%
rename from src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
index 9adbad67236..7c6540c8861 100644
--- a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
@@ -7,15 +7,15 @@ import java.io.File
import java.nio.charset.StandardCharsets
import java.nio.file.Files
-import com.microsoft.ml.spark.codegen.Config
+import com.microsoft.ml.spark.codegen.CodegenConfig
import com.microsoft.ml.spark.core.env.FileUtilities
-import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.commons.io.FileUtils
import org.apache.spark.ml._
import org.apache.spark.ml.param.{DataFrameEquality, ExternalPythonWrappableParam, ParamPair}
import org.apache.spark.ml.util.{MLReadable, MLWritable}
import org.apache.spark.sql.DataFrame
import com.microsoft.ml.spark.codegen.GenerationUtils._
+import com.microsoft.ml.spark.core.test.base.TestBase
/**
* Class for holding test information, call by name to avoid unnecessary computations in test generations
@@ -50,17 +50,17 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality
val testClassName: String = this.getClass.getName.split(".".toCharArray).last
- val testDataDir: File = FileUtilities.join(
- Config.TestDataDir, this.getClass.getName.split(".".toCharArray).last)
+ def testDataDir(conf: CodegenConfig): File = FileUtilities.join(
+ conf.testDataDir, this.getClass.getName.split(".".toCharArray).last)
- def saveDataset(df: DataFrame, name: String): Unit = {
- df.write.mode("overwrite").parquet(new File(testDataDir, s"$name.parquet").toString)
+ def saveDataset(conf: CodegenConfig, df: DataFrame, name: String): Unit = {
+ df.write.mode("overwrite").parquet(new File(testDataDir(conf), s"$name.parquet").toString)
}
- def saveModel(model: S, name: String): Unit = {
+ def saveModel(conf: CodegenConfig, model: S, name: String): Unit = {
model match {
case writable: MLWritable =>
- writable.write.overwrite().save(new File(testDataDir, s"$name.model").toString)
+ writable.write.overwrite().save(new File(testDataDir(conf), s"$name.model").toString)
case _ =>
throw new IllegalArgumentException(s"${model.getClass.getName} is not writable")
}
@@ -69,14 +69,14 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality
val testFitting = false
- def saveTestData(): Unit = {
- testDataDir.mkdirs()
+ def saveTestData(conf: CodegenConfig): Unit = {
+ testDataDir(conf).mkdirs()
pyTestObjects().zipWithIndex.foreach { case (to, i) =>
- saveModel(to.stage, s"model-$i")
+ saveModel(conf, to.stage, s"model-$i")
if (testFitting) {
- saveDataset(to.fitDF, s"fit-$i")
- saveDataset(to.transDF, s"trans-$i")
- to.validateDF.foreach(saveDataset(_, s"val-$i"))
+ saveDataset(conf, to.fitDF, s"fit-$i")
+ saveDataset(conf, to.transDF, s"trans-$i")
+ to.validateDF.foreach(saveDataset(conf, _, s"val-$i"))
}
}
}
@@ -144,9 +144,9 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality
}
- def makePyTestFile(): Unit = {
+ def makePyTestFile(conf: CodegenConfig): Unit = {
spark
- saveTestData()
+ saveTestData(conf)
val generatedTests = pyTestObjects().zipWithIndex.map { case (to, i) => makePyTests(to, i) }
val stage = pyTestObjects().head.stage
val stageName = stage.getClass.getName.split(".".toCharArray).last
@@ -159,7 +159,7 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality
|from os.path import join
|import json
|
- |test_data_dir = "${testDataDir.toString.replaceAllLiterally("\\", "\\\\")}"
+ |test_data_dir = "${testDataDir(conf).toString.replaceAllLiterally("\\", "\\\\")}"
|
|
|class $testClassName(unittest.TestCase):
@@ -180,7 +180,7 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality
val testFolders = importPath.mkString(".")
.replaceAllLiterally("com.microsoft.ml.spark", "mmlsparktest").split(".".toCharArray)
- val testDir = FileUtilities.join((Seq(Config.PyTestDir.toString) ++ testFolders.toSeq): _*)
+ val testDir = FileUtilities.join((Seq(conf.pyTestDir.toString) ++ testFolders.toSeq): _*)
testDir.mkdirs()
Files.write(
FileUtilities.join(testDir, "test_" + camelToSnake(testClassName) + ".py").toPath,
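
The hunk above replaces the global `Config` singleton with an explicitly passed `CodegenConfig`, so each sub-project can direct generated Python tests and test data to its own directories. A minimal sketch of the pattern, using a simplified two-field config (the real `CodegenConfig` has more members; names here are assumptions):

```scala
import java.io.File

// Simplified stand-in for codegen.CodegenConfig; fields are assumptions.
final case class CodegenConfig(testDataDir: File, pyTestDir: File)

object ConfigInjectionSketch {
  // Before: a val resolved against a global Config object at trait-init time.
  // After: a def deriving the directory from whichever config is passed in,
  // so the same fuzzing trait serves core, deep-learning, etc.
  def testDataDir(conf: CodegenConfig, testClassName: String): File =
    new File(conf.testDataDir, testClassName)

  def saveTestData(conf: CodegenConfig, testClassName: String): Unit =
    testDataDir(conf, testClassName).mkdirs()
}
```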
diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala b/core/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/HasExplainTargetSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/HasExplainTargetSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/HasExplainTargetSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/HasExplainTargetSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/KernelSHAPSamplerSupportSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/KernelSHAPSamplerSupportSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/KernelSHAPSamplerSupportSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/KernelSHAPSamplerSupportSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/LassoRegressionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/LassoRegressionSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/LassoRegressionSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/LassoRegressionSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/LeastSquaresRegressionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/LeastSquaresRegressionSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/LeastSquaresRegressionSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/LeastSquaresRegressionSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala
index 4606bedcf70..0c4ea711ed0 100644
--- a/src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/SamplerSuite.scala
@@ -6,7 +6,6 @@ package com.microsoft.ml.spark.explainers.split1
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import breeze.stats.distributions.RandBasis
import breeze.stats.{mean, stddev}
-import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.explainers.BreezeUtils._
import com.microsoft.ml.spark.explainers._
import com.microsoft.ml.spark.io.image.ImageUtils
@@ -17,8 +16,9 @@ import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest.Matchers._
-
import java.nio.file.{Files, Paths}
+
+import com.microsoft.ml.spark.core.test.base.TestBase
import javax.imageio.ImageIO
class SamplerSuite extends TestBase {
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularLIMEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularLIMEExplainerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularLIMEExplainerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularLIMEExplainerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularSHAPExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularSHAPExplainerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularSHAPExplainerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TabularSHAPExplainerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/TextExplainersSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TextExplainersSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/TextExplainersSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/TextExplainersSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorLIMEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorLIMEExplainerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorLIMEExplainerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorLIMEExplainerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorSHAPExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorSHAPExplainerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorSHAPExplainerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/explainers/split1/VectorSHAPExplainerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
similarity index 99%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
index 7ff30e5c723..72168f2badc 100644
--- a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
@@ -13,7 +13,7 @@ import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.commons.io.FileUtils
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.feature.StringIndexer
-import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors, Vector}
+import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql._
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
similarity index 93%
rename from src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
index 197f85a6fb5..9c014e715a8 100644
--- a/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
@@ -3,13 +3,13 @@
package com.microsoft.ml.spark.flaky
-import com.microsoft.ml.spark.core.test.base.{SparkSessionFactory, TestBase, TimeLimitedFlaky}
+import com.microsoft.ml.spark.core.test.base.{TestBase, TimeLimitedFlaky}
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.stages.PartitionConsolidator
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{DoubleType, StructType}
-import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
+import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.scalatest.Assertion
class PartitionConsolidatorSuite extends TransformerFuzzing[PartitionConsolidator] with TimeLimitedFlaky {
diff --git a/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala b/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala
new file mode 100644
index 00000000000..63dbea62576
--- /dev/null
+++ b/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala
@@ -0,0 +1,109 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.image
+
+import java.io.File
+import java.net.URL
+
+import com.microsoft.ml.spark.build.BuildInfo
+import com.microsoft.ml.spark.core.env.FileUtilities
+import com.microsoft.ml.spark.core.test.base.TestBase
+import org.apache.spark.ml.linalg.DenseVector
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import com.microsoft.ml.spark.io.IOImplicits.dfrToDfre
+import org.apache.commons.io.FileUtils
+import org.apache.spark.sql.functions.col
+
+trait ImageTestUtils extends TestBase {
+
+ val filesRoot = BuildInfo.datasetDir.toString
+ val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
+ val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString
+ val inputCol = "cntk_images"
+ val outputCol = "out"
+ val labelCol = "labels"
+
+ val featureVectorLength = 3 * 32 * 32
+ lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString
+
+ def testModelDF(spark: SparkSession): DataFrame = {
+ import spark.implicits._
+ spark.sparkContext.parallelize(Seq(
+ Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720,
+ -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090),
+ Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990,
+ -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880),
+ Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967,
+ 3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830),
+ Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430,
+ -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510),
+ Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690,
+ 4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270),
+ Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470,
+ 0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF
+ }
+
+ def testImages(spark: SparkSession): DataFrame = {
+ val images = spark.read.image.load(imagePath)
+
+ val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol)
+
+ unroll.transform(images).select(inputCol)
+ }
+
+ def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = {
+ import spark.implicits._
+ if (outputDouble) {
+ List
+ .fill(rows)(List.fill(size)(0.0).toArray)
+ .zip(List.fill(rows)(0.0))
+ .toDF(inputCol, labelCol)
+ } else {
+ List
+ .fill(rows)(List.fill(size)(0.0.toFloat).toArray)
+ .zip(List.fill(rows)(0.0))
+ .toDF(inputCol, labelCol)
+ }
+ }
+
+ protected def compareToTestModel(result: DataFrame) = {
+ // TODO: improve checks
+ assert(result.columns.toSet == Set(inputCol, outputCol))
+ assert(result.count() == testModelDF(result.sparkSession).count())
+ val max = result
+ .select(outputCol)
+ .collect()
+ .map(row => row.getAs[DenseVector](0).toArray.max)
+ .max
+ assert(max < 10 && max > -10)
+ }
+
+ lazy val images: DataFrame = spark.read.image.load(imagePath)
+ .withColumnRenamed("image", inputCol)
+ lazy val binaryImages: DataFrame = spark.read.binary.load(imagePath)
+ .select(col("value.bytes").alias(inputCol))
+
+ lazy val groceriesPath = FileUtilities.join(BuildInfo.datasetDir, "Images", "Grocery")
+ lazy val groceryImages: DataFrame = spark.read.image
+ .option("dropInvalid", true)
+ .load(groceriesPath + "**")
+ .withColumnRenamed("image", inputCol)
+
+ lazy val greyscaleImageLocation: String = {
+ val loc = "/tmp/greyscale.jpg"
+ val f = new File(loc)
+ if (f.exists()) {f.delete()}
+ FileUtils.copyURLToFile(new URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), f)
+ loc
+ }
+
+ lazy val greyscaleImage: DataFrame = spark
+ .read.image.load(greyscaleImageLocation)
+ .select(col("image").alias(inputCol))
+
+ lazy val greyscaleBinary: DataFrame = spark
+ .read.binary.load(greyscaleImageLocation)
+ .select(col("value.bytes").alias(inputCol))
+
+}
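
`ImageTestUtils` consolidates fixtures that several suites previously duplicated (compare the lazy vals deleted from `NetworkUtils` later in this patch). Declaring them as `lazy val`s in one trait means each expensive resource, such as the downloaded greyscale image, is built at most once per suite and only if a test touches it. A minimal sketch of that fixture pattern, with a hypothetical resource:

```scala
trait FixtureSketch {
  // Built on first access, never rebuilt, skipped entirely if unused.
  lazy val expensiveFixture: Seq[Int] = {
    println("materializing fixture once")
    (1 to 1000).toVector
  }
}

object FixtureDemo extends App with FixtureSketch {
  println(expensiveFixture.sum) // triggers the single initialization
  println(expensiveFixture.max) // reuses the cached value
}
```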
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala
similarity index 99%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala
index 13592cec90b..b611ef5158e 100644
--- a/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala
@@ -5,7 +5,7 @@ package com.microsoft.ml.spark.io.split1
import java.io.{File, FileInputStream}
-import com.microsoft.ml.spark.cognitive.OsUtils
+import com.microsoft.ml.spark.core.utils.OsUtils
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.schema.ImageSchemaUtils
import com.microsoft.ml.spark.core.test.base.TestBase
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala
index 5507196ee7b..40cf3936191 100644
--- a/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala
@@ -5,7 +5,6 @@ package com.microsoft.ml.spark.io.split2
import java.io.File
import java.util.UUID
-import java.util.concurrent.TimeUnit
import com.microsoft.ml.spark.core.test.base.{Flaky, TestBase}
import com.microsoft.ml.spark.io.IOImplicits._
@@ -15,7 +14,6 @@ import org.apache.spark.sql.streaming.{DataStreamReader, StreamingQuery, Trigger
import org.apache.spark.sql.types.BinaryType
import scala.concurrent.Await
-import scala.concurrent.duration.Duration
// scalastyle:off magic.number
class ContinuousHTTPSuite extends TestBase with Flaky with HTTPTestUtils {
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala
similarity index 99%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala
index 5dd5b437408..d5d106315b8 100644
--- a/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala
@@ -354,7 +354,7 @@ class DistributedHTTPSuite extends TestBase with Flaky with HTTPTestUtils {
processes.foreach { p =>
p.waitFor
- val error = IOUtils.toString(p.getErrorStream)
+ val error = IOUtils.toString(p.getErrorStream, "UTF-8")
assert(error === "")
}
}
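
The no-charset `IOUtils.toString(stream)` overload decodes with the JVM's platform-default charset and is deprecated in commons-io for that reason: results can differ between build agents. Passing `"UTF-8"` explicitly, as this hunk and the `DatabricksUtilities` and `ModelDownloader` hunks below do, makes the decode deterministic. A minimal demonstration:

```scala
import java.io.ByteArrayInputStream
import java.nio.charset.StandardCharsets
import org.apache.commons.io.IOUtils

object CharsetSketch extends App {
  val bytes = "héllo".getBytes(StandardCharsets.UTF_8)
  // Explicit charset: identical output on every JVM, regardless of the
  // process's file.encoding setting.
  println(IOUtils.toString(new ByteArrayInputStream(bytes), "UTF-8"))
}
```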
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala b/core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
similarity index 98%
rename from src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
index e623605967e..2ee5fd153e2 100644
--- a/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
@@ -86,10 +86,6 @@ class VerifyIsolationForest extends Benchmarks with EstimatorFuzzing[IsolationFo
data
}
- test("foo"){
- new IsolationForest().makePyFile()
- }
-
override def reader: MLReadable[_] = IsolationForest
override def modelReader: MLReadable[_] = IsolationForestModel
diff --git a/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala
new file mode 100644
index 00000000000..b58e597944b
--- /dev/null
+++ b/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala
@@ -0,0 +1,66 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.lime
+
+import breeze.linalg.{*, DenseMatrix}
+import breeze.stats.distributions.Rand
+import com.microsoft.ml.spark.core.test.base.TestBase
+import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing}
+import org.apache.spark.ml.linalg.DenseVector
+import org.apache.spark.ml.param.DataFrameEquality
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.ml.util.MLReadable
+
+trait LimeTestBase extends TestBase {
+
+ import spark.implicits._
+
+ lazy val nRows = 100
+ lazy val d1 = 3
+ lazy val d2 = 1
+
+ lazy val m: DenseMatrix[Double] = new DenseMatrix(d1, d2, Array(1.0, -1.0, 2.0))
+ lazy val x: DenseMatrix[Double] = DenseMatrix.rand(nRows, d1, Rand.gaussian)
+ lazy val noise: DenseMatrix[Double] = DenseMatrix.rand(nRows, d2, Rand.gaussian) * 0.1
+ lazy val y = x * m // + noise omitted so the coefficients can be recovered exactly
+
+ lazy val xRows = x(*, ::).iterator.toSeq.map(dv => new DenseVector(dv.toArray))
+ lazy val yRows = y(*, ::).iterator.toSeq.map(dv => dv(0))
+ lazy val df = xRows.zip(yRows).toDF("features", "label")
+
+ lazy val model = new LinearRegression().fit(df)
+
+ lazy val lime = new TabularLIME()
+ .setModel(model)
+ .setInputCol("features")
+ .setPredictionCol(model.getPredictionCol)
+ .setOutputCol("out")
+ .setNSamples(1000)
+
+ lazy val limeModel = lime.fit(df)
+}
+
+class TabularLIMESuite extends EstimatorFuzzing[TabularLIME] with
+ DataFrameEquality with LimeTestBase {
+
+ test("text lime usage test check") {
+ val results = limeModel.transform(df).select("out")
+ .collect().map(_.getAs[DenseVector](0))
+ results.foreach(result => assert(result === new DenseVector(m.data)))
+ }
+
+ override def testObjects(): Seq[TestObject[TabularLIME]] = Seq(new TestObject(lime, df))
+
+ override def reader: MLReadable[_] = TabularLIME
+
+ override def modelReader: MLReadable[_] = TabularLIMEModel
+}
+
+class TabularLIMEModelSuite extends TransformerFuzzing[TabularLIMEModel] with
+ DataFrameEquality with LimeTestBase {
+
+ override def testObjects(): Seq[TestObject[TabularLIMEModel]] = Seq(new TestObject(limeModel, df))
+
+ override def reader: MLReadable[_] = TabularLIMEModel
+}
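
`LimeTestBase` builds a noiseless linear dataset `y = x * m` so the suite can assert that `TabularLIME` recovers the exact coefficient vector `m`. The same recovery can be checked directly with Breeze, which makes the fixture's intent easy to see; a standalone sketch (no Spark, assuming only Breeze on the classpath):

```scala
import breeze.linalg.{pinv, DenseMatrix, DenseVector}
import breeze.stats.distributions.Rand

object LinearRecoverySketch extends App {
  val m = DenseVector(1.0, -1.0, 2.0)             // ground-truth coefficients
  val x = DenseMatrix.rand(100, 3, Rand.gaussian) // 100 samples, 3 features
  val y = x * m                                   // noiseless targets
  // With no noise, a least-squares fit via the pseudo-inverse recovers m
  // up to floating-point error, which is what the LIME test expects.
  println(pinv(x) * y) // ~ DenseVector(1.0, -1.0, 2.0)
}
```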
diff --git a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala
similarity index 96%
rename from src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala
index 5d2c26e330f..289720f9691 100644
--- a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala
@@ -7,13 +7,13 @@ import java.awt.Color
import java.awt.image.BufferedImage
import java.io.File
-import com.microsoft.ml.spark.cntk.CNTKTestUtils
+import com.microsoft.ml.spark.image.ImageTestUtils
import com.microsoft.ml.spark.io.image.ImageUtils
import javax.imageio.ImageIO
import scala.util.Random
-class SuperpixelSuite extends CNTKTestUtils {
+class SuperpixelSuite extends ImageTestUtils {
lazy val sp1 = new Superpixel(img, 16, 130)
lazy val sp2 = new Superpixel(img2, 100, 130)
diff --git a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
similarity index 90%
rename from src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
index 881aefed41a..0c4a5b78d0b 100644
--- a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
@@ -4,12 +4,12 @@
package com.microsoft.ml.spark.lime
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
-import com.microsoft.ml.spark.image.NetworkUtils
+import com.microsoft.ml.spark.image.ImageTestUtils
import com.microsoft.ml.spark.io.split1.FileReaderUtils
import org.apache.spark.ml.util.MLReadable
class SuperpixelTransformerSuite extends TransformerFuzzing[SuperpixelTransformer]
- with NetworkUtils with FileReaderUtils {
+ with ImageTestUtils with FileReaderUtils {
lazy val spt: SuperpixelTransformer = new SuperpixelTransformer().setInputCol(inputCol)
test("basic functionality"){
diff --git a/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala
similarity index 96%
rename from src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala
index 7ce9ba7e569..bda6857db7c 100644
--- a/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala
@@ -42,9 +42,7 @@ object DatabricksUtilities extends HasHttpClient {
val Folder = s"/MMLSparkBuild/build_${BuildInfo.version}"
// MMLSpark info
- val TruncatedScalaVersion: String = BuildInfo.scalaVersion
- .split(".".toCharArray.head).dropRight(1).mkString(".")
- val Version = s"com.microsoft.ml.spark:${BuildInfo.name}_$TruncatedScalaVersion:${BuildInfo.version}"
+ val Version = s"com.microsoft.ml.spark:mmlspark:${BuildInfo.version}"
val Repository = "https://mmlspark.azureedge.net/maven"
val Libraries: String = List(
@@ -59,7 +57,7 @@ object DatabricksUtilities extends HasHttpClient {
val TimeoutInMillis: Int = 40 * 60 * 1000
val NotebookFiles: Array[File] = Option(
- FileUtilities.join(BuildInfo.baseDirectory, "notebooks", "samples").getCanonicalFile.listFiles()
+ FileUtilities.join(BuildInfo.baseDirectory.getParent, "notebooks").getCanonicalFile.listFiles()
).get
val ParallizableNotebooks = NotebookFiles.filterNot(_.getName.contains("Vowpal"))
@@ -88,7 +86,7 @@ object DatabricksUtilities extends HasHttpClient {
if (response.getStatusLine.getStatusCode != 200) {
throw new RuntimeException(s"Failed: response: $response")
}
- IOUtils.toString(response.getEntity.getContent).parseJson
+ IOUtils.toString(response.getEntity.getContent, "UTF-8").parseJson
}.get
})
}
@@ -104,7 +102,7 @@ object DatabricksUtilities extends HasHttpClient {
val entity = IOUtils.toString(response.getEntity.getContent, "UTF-8")
throw new RuntimeException(s"Failed:\n entity:$entity \n response: $response")
}
- IOUtils.toString(response.getEntity.getContent).parseJson
+ IOUtils.toString(response.getEntity.getContent, "UTF-8").parseJson
}.get
})
}
diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala
similarity index 92%
rename from src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala
index 1507d152500..c96764cfd29 100644
--- a/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala
@@ -3,9 +3,8 @@
package com.microsoft.ml.spark.stages
-import com.microsoft.ml.spark.codegen.Config
import com.microsoft.ml.spark.core.test.base.TestBase
-import com.microsoft.ml.spark.core.test.fuzzing.{PyTestFuzzing, TestObject, TransformerFuzzing}
+import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.util.MLReadable
class DropColumnsSuite extends TestBase with TransformerFuzzing[DropColumns] {
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala
rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala
rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala
rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala
index 959486a9093..387eb04e375 100644
--- a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala
@@ -6,7 +6,6 @@ package com.microsoft.ml.spark.train
import java.io.File
import com.microsoft.ml.spark.core.schema.SchemaConstants
-import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.benchmarks.Benchmarks
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import com.microsoft.ml.spark.featurize.ValueIndexer
@@ -18,6 +17,7 @@ import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, Multiclas
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Row}
import com.microsoft.ml.spark.codegen.GenerationUtils
+import com.microsoft.ml.spark.core.test.base.TestBase
object ClassifierTestUtils {
diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala
rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala
diff --git a/src/main/R/model_downloader.R b/deep-learning/src/main/R/model_downloader.R
similarity index 100%
rename from src/main/R/model_downloader.R
rename to deep-learning/src/main/R/model_downloader.R
diff --git a/src/main/python/mmlspark/cntk/CNTKModel.py b/deep-learning/src/main/python/mmlspark/cntk/CNTKModel.py
similarity index 100%
rename from src/main/python/mmlspark/cntk/CNTKModel.py
rename to deep-learning/src/main/python/mmlspark/cntk/CNTKModel.py
diff --git a/src/main/python/mmlspark/image/ImageFeaturizer.py b/deep-learning/src/main/python/mmlspark/cntk/ImageFeaturizer.py
similarity index 94%
rename from src/main/python/mmlspark/image/ImageFeaturizer.py
rename to deep-learning/src/main/python/mmlspark/cntk/ImageFeaturizer.py
index 9c4ae54a29e..a85cd56a095 100644
--- a/src/main/python/mmlspark/image/ImageFeaturizer.py
+++ b/deep-learning/src/main/python/mmlspark/cntk/ImageFeaturizer.py
@@ -6,7 +6,7 @@
if sys.version >= '3':
basestring = str
-from mmlspark.image._ImageFeaturizer import _ImageFeaturizer
+from mmlspark.cntk._ImageFeaturizer import _ImageFeaturizer
from pyspark.ml.common import inherit_doc
from pyspark.sql import SparkSession
diff --git a/src/test/python/mmlsparktest/__init__.py b/deep-learning/src/main/python/mmlspark/cntk/__init__.py
similarity index 100%
rename from src/test/python/mmlsparktest/__init__.py
rename to deep-learning/src/main/python/mmlspark/cntk/__init__.py
diff --git a/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala b/deep-learning/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala
similarity index 100%
rename from src/main/scala/com/microsoft/CNTK/SerializableFunction.scala
rename to deep-learning/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ImageFeaturizer.scala
similarity index 98%
rename from src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ImageFeaturizer.scala
index 2db42e83b0c..73dce569944 100644
--- a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala
+++ b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ImageFeaturizer.scala
@@ -1,20 +1,20 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.
-package com.microsoft.ml.spark.image
+package com.microsoft.ml.spark.cntk
import com.microsoft.CNTK.CNTKExtensions._
import com.microsoft.CNTK.{SerializableFunction => CNTKFunction}
-import com.microsoft.ml.spark.cntk.CNTKModel
-import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol}
import com.microsoft.ml.spark.codegen.Wrappable
+import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol}
import com.microsoft.ml.spark.core.schema.{DatasetExtensions, ImageSchemaUtils}
import com.microsoft.ml.spark.downloader.ModelSchema
+import com.microsoft.ml.spark.image.{ResizeImageTransformer, UnrollBinaryImage, UnrollImage}
import com.microsoft.ml.spark.logging.BasicLogging
-import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.sql.types.{BinaryType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset}
@@ -132,7 +132,7 @@ class ImageFeaturizer(val uid: String) extends Transformer with HasInputCol with
/** @group getParam */
def getLayerNames: Array[String] = $(layerNames)
- setDefault(cutOutputLayers -> 1, outputCol -> (uid + "_output"), dropNa->true)
+ setDefault(cutOutputLayers -> 1, outputCol -> (uid + "_output"), dropNa -> true)
override def transform(dataset: Dataset[_]): DataFrame = {
logTransform[DataFrame]({
diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ImageFeaturizer.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ImageFeaturizer.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala
similarity index 89%
rename from src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala
index 3b68d0ee507..8c2a46c55e6 100644
--- a/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala
+++ b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala
@@ -7,6 +7,7 @@ import java.io._
import java.net.{URI, URL}
import java.util
+import com.microsoft.ml.spark.core.utils.FaultToleranceUtils
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.{Configuration => HadoopConf}
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}
@@ -15,10 +16,8 @@ import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
import spray.json._
-import scala.annotation.tailrec
import scala.collection.JavaConverters._
-import scala.concurrent.duration.{Duration, FiniteDuration}
-import scala.concurrent.{Await, ExecutionContext, Future}
+import scala.concurrent.duration.Duration
/** Abstract representation of a repository for future expansion
*
@@ -34,32 +33,6 @@ private[spark] abstract class Repository[S <: Schema] {
}
-object FaultToleranceUtils {
- def retryWithTimeout[T](times: Int, timeout: Duration)(f: => T): T ={
- try {
- Await.result(Future(f)(ExecutionContext.global), timeout)
- } catch {
- case e: Exception if times >= 1 =>
- print(s"Received exception on call, retrying: $e")
- retryWithTimeout(times-1, timeout)(f)
- }
- }
-
- val Backoffs: Seq[Int] = Seq(0, 100, 200, 500)
-
- def retryWithTimeout[T](times: Seq[Int] = Backoffs)(f: => T): T ={
- try {
- f
- } catch {
- case e: Exception if times.nonEmpty =>
- println(s"Received exception on call, retrying: $e")
- Thread.sleep(times.head)
- retryWithTimeout(times.tail)(f)
- }
- }
-
-}
-
/** Exception returned if a repo cannot find the file
*
* @param uri : location of the file
@@ -90,7 +63,7 @@ private[spark] class HDFSRepo[S <: Schema](val uri: URI, val hconf: HadoopConf)
.filter(status =>
status.isFile & status.getPath.toString.endsWith(".meta"))
.map(status =>
- IOUtils.toString(fs.open(status.getPath).getWrappedStream))
+ IOUtils.toString(fs.open(status.getPath).getWrappedStream, "UTF-8"))
schemaStrings.map(s => s.parseJson.convertTo[S]).toList
}
@@ -121,7 +94,7 @@ private[spark] class HDFSRepo[S <: Schema](val uri: URI, val hconf: HadoopConf)
val newSchema = schema.updateURI(location)
val schemaPath = new Path(location.getPath + ".meta")
val osSchema = fs.create(schemaPath)
- val schemaIs = IOUtils.toInputStream(newSchema.toJson.prettyPrint)
+ val schemaIs = IOUtils.toInputStream(newSchema.toJson.prettyPrint, "UTF-8")
try {
HUtils.copyBytes(schemaIs, osSchema, hconf)
} finally {
@@ -157,9 +130,9 @@ private[spark] class DefaultModelRepo(val baseURL: URL) extends Repository[Model
val url = join(baseURL, "MANIFEST")
val manifestStream = toStream(url)
try {
- val modelStreams = IOUtils.readLines(manifestStream).asScala.map(fn => toStream(join(baseURL, fn)))
+ val modelStreams = IOUtils.readLines(manifestStream, "UTF-8").asScala.map(fn => toStream(join(baseURL, fn)))
try {
- modelStreams.map(s => IOUtils.toString(s).parseJson.convertTo[ModelSchema])
+ modelStreams.map(s => IOUtils.toString(s, "UTF-8").parseJson.convertTo[ModelSchema])
} finally {
modelStreams.foreach(_.close())
}
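
`FaultToleranceUtils` moves out of the downloader into `core.utils` so other modules can share it; its behavior, shown in the deleted block above, is unchanged. A hedged usage sketch, with a hypothetical `fetchManifest` standing in for any flaky network call:

```scala
import com.microsoft.ml.spark.core.utils.FaultToleranceUtils
import scala.concurrent.duration._

object RetrySketch extends App {
  def fetchManifest(): String = "MANIFEST" // hypothetical flaky call

  // Up to 3 retries, each attempt bounded by a 30 second timeout.
  val manifest = FaultToleranceUtils.retryWithTimeout(3, 30.seconds)(fetchManifest())
  println(manifest)
}
```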
diff --git a/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala
index 37b4b1ad615..f8483945360 100644
--- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala
@@ -9,11 +9,12 @@ import com.microsoft.CNTK.CNTKExtensions._
import com.microsoft.CNTK.{SerializableFunction => CNTKFunction, _}
import com.microsoft.ml.spark.core.env.StreamUtilities._
import com.microsoft.ml.spark.core.test.base.LinuxOnly
+import com.microsoft.ml.spark.image.ImageTestUtils
import org.apache.commons.io.IOUtils
import scala.collection.JavaConverters._
-class CNTKBindingSuite extends LinuxOnly with CNTKTestUtils {
+class CNTKBindingSuite extends LinuxOnly with ImageTestUtils {
def toSeqSeq(fvv: FloatVectorVector): Seq[Seq[Float]] = {
(0 until fvv.size.toInt).map(i =>
diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala
index 34893a7015c..8d2285be0ad 100644
--- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala
@@ -10,6 +10,7 @@ import com.microsoft.ml.spark.build.BuildInfo
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.test.base.LinuxOnly
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
+import com.microsoft.ml.spark.image.ImageTestUtils
import org.apache.commons.io.FileUtils
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.DenseVector
@@ -21,7 +22,7 @@ import org.apache.spark.sql.types._
import scala.util.Random
-class CNTKModelSuite extends LinuxOnly with CNTKTestUtils with TransformerFuzzing[CNTKModel] {
+class CNTKModelSuite extends LinuxOnly with ImageTestUtils with TransformerFuzzing[CNTKModel] {
// TODO: Move away from getTempDirectoryPath and have TestBase provide one
@@ -54,7 +55,7 @@ class CNTKModelSuite extends LinuxOnly with CNTKTestUtils with TransformerFuzzin
.setOutputNodeIndex(0)
}
- lazy val images = testImages(spark)
+ override lazy val images = testImages(spark)
import spark.implicits._
diff --git a/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/ImageFeaturizerSuite.scala
similarity index 80%
rename from src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/ImageFeaturizerSuite.scala
index 247c7a421e1..1f9ca641c5a 100644
--- a/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/ImageFeaturizerSuite.scala
@@ -1,31 +1,28 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.
-package com.microsoft.ml.spark.image
+package com.microsoft.ml.spark.cntk
import java.io.File
-import java.net.{URI, URL}
+import java.net.URI
import com.microsoft.ml.spark.Secrets
import com.microsoft.ml.spark.build.BuildInfo
-import com.microsoft.ml.spark.cntk.CNTKTestUtils
import com.microsoft.ml.spark.core.env.FileUtilities
-import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
-import com.microsoft.ml.spark.core.utils.ModelEquality
import com.microsoft.ml.spark.downloader.{ModelDownloader, ModelSchema}
+import com.microsoft.ml.spark.image.ImageTestUtils
import com.microsoft.ml.spark.io.IOImplicits._
import com.microsoft.ml.spark.io.powerbi.PowerBIWriter
import com.microsoft.ml.spark.io.split1.FileReaderUtils
-import org.apache.commons.io.FileUtils
import org.apache.spark.injections.UDFUtils
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType
-trait NetworkUtils extends CNTKTestUtils with FileReaderUtils {
+trait TrainedCNTKModelUtils extends ImageTestUtils with FileReaderUtils {
lazy val modelDir = new File(filesRoot, "CNTKModel")
lazy val modelDownloader = new ModelDownloader(spark, modelDir.toURI)
@@ -33,33 +30,6 @@ trait NetworkUtils extends CNTKTestUtils with FileReaderUtils {
lazy val resNetUri: URI = new File(modelDir, "ResNet50_ImageNet.model").toURI
lazy val resNet: ModelSchema = modelDownloader.downloadByName("ResNet50")
- lazy val images: DataFrame = spark.read.image.load(imagePath)
- .withColumnRenamed("image", inputCol)
- lazy val binaryImages: DataFrame = spark.read.binary.load(imagePath)
- .select(col("value.bytes").alias(inputCol))
-
- lazy val groceriesPath = FileUtilities.join(BuildInfo.datasetDir, "Images","Grocery")
- lazy val groceryImages: DataFrame = spark.read.image
- .option("dropInvalid", true)
- .load(groceriesPath + "**")
- .withColumnRenamed("image", inputCol)
-
- lazy val greyscaleImageLocation: String = {
- val loc = "/tmp/greyscale.jpg"
- val f = new File(loc)
- if (f.exists()) {f.delete()}
- FileUtils.copyURLToFile(new URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), f)
- loc
- }
-
- lazy val greyscaleImage: DataFrame = spark
- .read.image.load(greyscaleImageLocation)
- .select(col("image").alias(inputCol))
-
- lazy val greyscaleBinary: DataFrame = spark
- .read.binary.load(greyscaleImageLocation)
- .select(col("value.bytes").alias(inputCol))
-
def resNetModel(): ImageFeaturizer = new ImageFeaturizer()
.setInputCol(inputCol)
.setOutputCol(outputCol)
@@ -68,7 +38,7 @@ trait NetworkUtils extends CNTKTestUtils with FileReaderUtils {
}
class ImageFeaturizerSuite extends TransformerFuzzing[ImageFeaturizer]
- with NetworkUtils {
+ with TrainedCNTKModelUtils {
test("Image featurizer should reproduce the CIFAR10 experiment") {
print(spark)
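
The trait renames in this file (`CNTKTestUtils` to `ImageTestUtils`, `NetworkUtils` to `TrainedCNTKModelUtils`) change how downstream suites mix in shared image fixtures. A minimal sketch of the post-rename pattern, using only members visible in this diff (`resNetModel()` and `inputCol` come from the traits; the suite itself is hypothetical, and it assumes `TestBase` exposes ScalaTest's `test`):

```scala
import com.microsoft.ml.spark.cntk.{ImageFeaturizer, TrainedCNTKModelUtils}
import com.microsoft.ml.spark.core.test.base.TestBase

// Hypothetical smoke suite: fixtures now come from ImageTestUtils via
// TrainedCNTKModelUtils instead of the old CNTKTestUtils/NetworkUtils pair.
class FeaturizerSmokeSuite extends TestBase with TrainedCNTKModelUtils {
  test("resNetModel is wired to the shared input column") {
    val featurizer: ImageFeaturizer = resNetModel()
    assert(featurizer.getInputCol == inputCol)
  }
}
```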
diff --git a/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala
index ee6d53933a0..f67e4b82d5c 100644
--- a/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala
@@ -7,6 +7,7 @@ import java.io.File
import java.nio.file.Files
import com.microsoft.ml.spark.core.test.base.TestBase
+import com.microsoft.ml.spark.core.utils.FaultToleranceUtils
import org.apache.commons.io.FileUtils
import scala.collection.JavaConverters._
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala
similarity index 86%
rename from src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala
index ae7103f2bd1..94e7d9aeb57 100644
--- a/src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/ImageExplainersSuite.scala
@@ -3,16 +3,16 @@
package com.microsoft.ml.spark.explainers
+import java.io.File
+import java.net.URL
+
+import com.microsoft.ml.spark.cntk.{ImageFeaturizer, TrainedCNTKModelUtils}
import com.microsoft.ml.spark.core.test.base.TestBase
-import com.microsoft.ml.spark.image.{ImageFeaturizer, NetworkUtils}
import com.microsoft.ml.spark.io.IOImplicits._
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.DataFrame
-import java.io.File
-import java.net.URL
-
-abstract class ImageExplainersSuite extends TestBase with NetworkUtils {
+abstract class ImageExplainersSuite extends TestBase with TrainedCNTKModelUtils {
lazy val greyhoundImageLocation: String = {
val loc = "/tmp/greyhound.jpg"
val f = new File(loc)
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageLIMEExplainerSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageLIMEExplainerSuite.scala
similarity index 98%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageLIMEExplainerSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageLIMEExplainerSuite.scala
index 41bc9b21ab2..131b69f6fdb 100644
--- a/src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageLIMEExplainerSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageLIMEExplainerSuite.scala
@@ -1,13 +1,13 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.
-package com.microsoft.ml.spark.explainers.split3
+package com.microsoft.ml.spark.explainers.split2
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.explainers.BreezeUtils._
import com.microsoft.ml.spark.explainers.{ImageExplainersSuite, ImageFormat, ImageLIME, LocalExplainer}
-import com.microsoft.ml.spark.lime.SuperpixelData
import com.microsoft.ml.spark.io.IOImplicits._
+import com.microsoft.ml.spark.lime.SuperpixelData
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.functions.col
diff --git a/src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageSHAPExplainerSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageSHAPExplainerSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageSHAPExplainerSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageSHAPExplainerSuite.scala
index 59fba17bb7a..1de490a4a8e 100644
--- a/src/test/scala/com/microsoft/ml/spark/explainers/split2/ImageSHAPExplainerSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/explainers/split3/ImageSHAPExplainerSuite.scala
@@ -1,11 +1,11 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.
-package com.microsoft.ml.spark.explainers.split2
+package com.microsoft.ml.spark.explainers.split3
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
-import com.microsoft.ml.spark.explainers.{ImageExplainersSuite, ImageFormat, ImageSHAP, LocalExplainer}
import com.microsoft.ml.spark.explainers.BreezeUtils._
+import com.microsoft.ml.spark.explainers.{ImageExplainersSuite, ImageFormat, ImageSHAP, LocalExplainer}
import com.microsoft.ml.spark.lime.SuperpixelData
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.util.MLReadable
diff --git a/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala
similarity index 65%
rename from src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala
index e83f910e377..b53d206137c 100644
--- a/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala
@@ -7,82 +7,23 @@ import java.awt.image.BufferedImage
import java.io.File
import java.net.URL
-import breeze.linalg.{*, DenseMatrix}
-import breeze.stats.distributions.Rand
-import com.microsoft.ml.spark.core.test.base.TestBase
-import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing}
-import com.microsoft.ml.spark.image.{ImageFeaturizer, NetworkUtils}
+import com.microsoft.ml.spark.cntk.{ImageFeaturizer, TrainedCNTKModelUtils}
+import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.io.IOImplicits._
import com.microsoft.ml.spark.io.image.ImageUtils
import com.microsoft.ml.spark.io.split1.FileReaderUtils
import com.microsoft.ml.spark.stages.UDFTransformer
import com.microsoft.ml.spark.stages.udfs.get_value_udf
import org.apache.commons.io.FileUtils
-import org.apache.spark.injections.UDFUtils
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.param.DataFrameEquality
-import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.ml.{NamespaceInjections, PipelineModel}
import org.apache.spark.sql.functions.col
-import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.{DataFrame, Row}
-trait LimeTestBase extends TestBase {
-
- import spark.implicits._
-
- lazy val nRows = 100
- lazy val d1 = 3
- lazy val d2 = 1
-
- lazy val m: DenseMatrix[Double] = new DenseMatrix(d1, d2, Array(1.0, -1.0, 2.0))
- lazy val x: DenseMatrix[Double] = DenseMatrix.rand(nRows, d1, Rand.gaussian)
- lazy val noise: DenseMatrix[Double] = DenseMatrix.rand(nRows, d2, Rand.gaussian) * 0.1
- lazy val y = x * m //+ noise
-
- lazy val xRows = x(*, ::).iterator.toSeq.map(dv => new DenseVector(dv.toArray))
- lazy val yRows = y(*, ::).iterator.toSeq.map(dv => dv(0))
- lazy val df = xRows.zip(yRows).toDF("features", "label")
-
- lazy val model = new LinearRegression().fit(df)
-
- lazy val lime = new TabularLIME()
- .setModel(model)
- .setInputCol("features")
- .setPredictionCol(model.getPredictionCol)
- .setOutputCol("out")
- .setNSamples(1000)
-
- lazy val limeModel = lime.fit(df)
-}
-
-class TabularLIMESuite extends EstimatorFuzzing[TabularLIME] with
- DataFrameEquality with LimeTestBase {
-
- test("text lime usage test check") {
- val results = limeModel.transform(df).select("out")
- .collect().map(_.getAs[DenseVector](0))
- results.foreach(result => assert(result === new DenseVector(m.data)))
- }
-
- override def testObjects(): Seq[TestObject[TabularLIME]] = Seq(new TestObject(lime, df))
-
- override def reader: MLReadable[_] = TabularLIME
-
- override def modelReader: MLReadable[_] = TabularLIMEModel
-}
-
-class TabularLIMEModelSuite extends TransformerFuzzing[TabularLIMEModel] with
- DataFrameEquality with LimeTestBase {
-
- override def testObjects(): Seq[TestObject[TabularLIMEModel]] = Seq(new TestObject(limeModel, df))
-
- override def reader: MLReadable[_] = TabularLIMEModel
-}
-
class ImageLIMESuite extends TransformerFuzzing[ImageLIME] with
- DataFrameEquality with NetworkUtils with FileReaderUtils {
+ DataFrameEquality with TrainedCNTKModelUtils with FileReaderUtils {
lazy val greyhoundImageLocation: String = {
val loc = "/tmp/greyhound.jpg"
diff --git a/docs/cogsvc.md b/docs/cogsvc.md
index edec95f3751..949ae14c96a 100644
--- a/docs/cogsvc.md
+++ b/docs/cogsvc.md
@@ -9,7 +9,7 @@
Azure Cognitive Services on Spark enables working with Azure's Intelligent Services at massive scales with the Apache Spark™ distributed computing ecosystem. Cognitive Services on Spark allows users to embed general-purpose and continuously improving intelligent models directly into their Apache Spark™ and SQL computations. This liberates developers from low-level networking details, so they can focus on creating intelligent, distributed applications. Each Cognitive Service acts as a SparkML transformer, so users can add services to existing SparkML pipelines. This is a great example of our [HTTP-on-Spark](http.md) capability that lets you interact with HTTP services from Apache Spark.
## Usage
-To see an example of Cognitive Services on Spark in action, take a look at [this sample notebook](../notebooks/samples/CognitiveServices%20-%20Celebrity%20Quote%20Analysis.ipynb).
+To see an example of Cognitive Services on Spark in action, take a look at [this sample notebook](../notebooks/CognitiveServices%20-%20Celebrity%20Quote%20Analysis.ipynb).
## Cognitive Services on Apache Spark™
Currently, the following Cognitive Services are available on Apache Spark™ through MMLSpark:
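
Because each Cognitive Service is a SparkML transformer, invoking one from Scala looks like any other pipeline stage. A minimal sketch, assuming the `TextSentiment` transformer and the setters used in the Cognitive Services sample notebooks (the key and region are placeholders, and an active SparkSession `spark` is assumed):

```scala
import com.microsoft.ml.spark.cognitive.TextSentiment
import spark.implicits._

val df = Seq("MMLSpark is great", "This traffic is terrible").toDF("text")

// Placeholder credentials: supply a real Text Analytics key and region.
val sentiment = new TextSentiment()
  .setSubscriptionKey(sys.env("TEXT_API_KEY"))
  .setLocation("eastus")
  .setTextCol("text")
  .setOutputCol("sentiment")

sentiment.transform(df).show()
```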
diff --git a/docs/datasets.md b/docs/datasets.md
index 8376027f4f4..595ae3d4098 100644
--- a/docs/datasets.md
+++ b/docs/datasets.md
@@ -24,7 +24,7 @@ tab-separated file with 2 columns (`rating`, `text`) and 10000 rows. The
contains free-form text strings in English language. You can use
`mmlspark.TextFeaturizer` to convert the text into feature vectors for machine
learning models ([see
-example](../notebooks/samples/201%20-%20Amazon%20Book%20Reviews%20-%20TextFeaturizer.ipynb)).
+example](../notebooks/201%20-%20Amazon%20Book%20Reviews%20-%20TextFeaturizer.ipynb)).
The example dataset is available
[here](https://mmlspark.azureedge.net/datasets/BookReviewsFromAmazon10K.tsv);
@@ -48,7 +48,7 @@ The example dataset is available
the original dataset is available at [Krizhevsky's
page](https://www.cs.toronto.edu/~kriz/cifar.html). The dataset has been
packaged into a gzipped tar archive. See notebook [301 - CIFAR10 CNTK CNN
-Evaluation](../notebooks/samples/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation.ipynb)
+Evaluation](../notebooks/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation.ipynb)
for an example of how to extract the image data.
Reference: [_Learning Multiple Layers of Features from Tiny
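
For the book-reviews dataset above, a hedged Scala sketch of the featurization step; the setter names are assumed to mirror the parameters used in the sample notebook, and `spark` is an active SparkSession:

```scala
import com.microsoft.ml.spark.featurize.text.TextFeaturizer

// The dataset is a headerless TSV with (rating, text) columns.
val reviews = spark.read
  .option("sep", "\t")
  .csv("BookReviewsFromAmazon10K.tsv")
  .toDF("rating", "text")

// Assumed setters mirroring the notebook's Python parameters.
val featurized = new TextFeaturizer()
  .setInputCol("text")
  .setOutputCol("features")
  .setUseStopWordsRemover(true)
  .setUseIDF(true)
  .fit(reviews)
  .transform(reviews)
```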
diff --git a/docs/lightgbm.md b/docs/lightgbm.md
index fed5bc34131..87d5c366f2e 100644
--- a/docs/lightgbm.md
+++ b/docs/lightgbm.md
@@ -49,7 +49,7 @@ model = LightGBMRegressor(application='quantile',
```
For an end-to-end application, check out the LightGBM [notebook
-example](../notebooks/samples/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb).
+example](../notebooks/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb).
### Architecture
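
The same quantile setup sketched in Scala, under the assumption that the Scala setters mirror the Python parameters shown above (in particular, check the generated API docs for the exact objective/application setter name):

```scala
import com.microsoft.ml.spark.lightgbm.LightGBMRegressor

// Assumed setter names mirroring the Python snippet above;
// `train` stands for a DataFrame with "label" and assembled "features".
val model = new LightGBMRegressor()
  .setObjective("quantile")
  .setAlpha(0.2)
  .setLearningRate(0.3)
  .setNumLeaves(31)
  .setLabelCol("label")
  .setFeaturesCol("features")
  .fit(train)
```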
diff --git a/docs/mmlspark-serving.md b/docs/mmlspark-serving.md
index d59e3e0c58a..9471644805f 100644
--- a/docs/mmlspark-serving.md
+++ b/docs/mmlspark-serving.md
@@ -25,7 +25,7 @@
### Jupyter Notebook Examples
-- [Deploy a classifier trained on the Adult Census Dataset](../notebooks/samples/SparkServing%20-%20Deploying%20a%20Classifier.ipynb)
+- [Deploy a classifier trained on the Adult Census Dataset](../notebooks/SparkServing%20-%20Deploying%20a%20Classifier.ipynb)
- More coming soon!
### Spark Serving Hello World
diff --git a/docs/vw.md b/docs/vw.md
index 6deaeedf089..ddb0b7f6920 100644
--- a/docs/vw.md
+++ b/docs/vw.md
@@ -58,7 +58,7 @@ model = (VowpalWabbitRegressor(args="--holdout_off --loss_function quantile -q :
Through the `args` parameter you can pass command-line parameters to VW as documented in the [VW Wiki](https://github.com/vowpalWabbit/vowpal_wabbit/wiki/Command-Line-Arguments).
For an end-to-end application, check out the VowpalWabbit [notebook
-example](../notebooks/samples/Vowpal%20Wabbit%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb]).
+example](../notebooks/Vowpal%20Wabbit%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb).
### Hyper-parameter tuning
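
A Scala counterpart to the snippet above, sketched on the assumption that the Scala estimator exposes the same pass-through parameter via `setArgs`:

```scala
import com.microsoft.ml.spark.vw.VowpalWabbitRegressor

// Raw VW command-line flags are forwarded verbatim through args;
// `train` stands for a DataFrame with "label" and "features" columns.
val model = new VowpalWabbitRegressor()
  .setArgs("--holdout_off --loss_function quantile -q ::")
  .setLabelCol("label")
  .setFeaturesCol("features")
  .fit(train)
```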
diff --git a/environment.yaml b/environment.yaml
index 1c1994e7858..338862d1001 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -6,6 +6,7 @@ dependencies:
- python=3.8.8
- pyspark=3.1.2
- requests
+ - pip
- r-base
- r-dplyr
- r-sparklyr
diff --git a/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py
similarity index 100%
rename from src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py
rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py
diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py
similarity index 100%
rename from src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py
rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py
diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py
similarity index 100%
rename from src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py
rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py
diff --git a/src/test/python/mmlsparktest/cognitive/__init__.py b/lightgbm/src/main/python/mmlspark/lightgbm/__init__.py
similarity index 100%
rename from src/test/python/mmlsparktest/cognitive/__init__.py
rename to lightgbm/src/main/python/mmlspark/lightgbm/__init__.py
diff --git a/src/main/python/mmlspark/lightgbm/mixin.py b/lightgbm/src/main/python/mmlspark/lightgbm/mixin.py
similarity index 100%
rename from src/main/python/mmlspark/lightgbm/mixin.py
rename to lightgbm/src/main/python/mmlspark/lightgbm/mixin.py
diff --git a/src/main/scala/com/microsoft/lightgbm/SWIG.scala b/lightgbm/src/main/scala/com/microsoft/lightgbm/SWIG.scala
similarity index 100%
rename from src/main/scala/com/microsoft/lightgbm/SWIG.scala
rename to lightgbm/src/main/scala/com/microsoft/lightgbm/SWIG.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/PartitionProcessor.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/PartitionProcessor.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/PartitionProcessor.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/PartitionProcessor.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
similarity index 99%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
index eddce5cc295..1c420b1d063 100644
--- a/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
+++ b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
@@ -8,7 +8,7 @@ import java.net._
import com.microsoft.ml.lightgbm._
import com.microsoft.ml.spark.core.env.StreamUtilities._
-import com.microsoft.ml.spark.downloader.FaultToleranceUtils
+import com.microsoft.ml.spark.core.utils.FaultToleranceUtils
import com.microsoft.ml.spark.lightgbm.booster.LightGBMBooster
import com.microsoft.ml.spark.lightgbm.dataset.{DatasetUtils, LightGBMDataset}
import com.microsoft.ml.spark.lightgbm.params.{ClassifierTrainParams, TrainParams}
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala
similarity index 98%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala
index a404a42e37f..02ba5b698e1 100644
--- a/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala
+++ b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/DatasetUtils.scala
@@ -134,8 +134,9 @@ object DatasetUtils {
/**
* Sample the first several rows to determine whether to construct sparse or dense matrix in lightgbm native code.
- * @param rowsIter Iterator of rows.
- * @param schema The schema.
+ *
+ * @param rowsIter Iterator of rows.
+ * @param schema The schema.
* @param columnParams The column parameters.
* @return A reconstructed iterator with the same original rows and whether the matrix should be sparse or dense.
*/
@@ -158,7 +159,7 @@ object DatasetUtils {
}
def addFeaturesToChunkedArray(featuresChunkedArrayOpt: Option[doubleChunkedArray], numCols: Int,
- rowAsDoubleArray: Array[Double]): Unit = {
+ rowAsDoubleArray: Array[Double]): Unit = {
featuresChunkedArrayOpt.foreach { featuresChunkedArray =>
rowAsDoubleArray.foreach { doubleVal =>
featuresChunkedArray.add(doubleVal)
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala
diff --git a/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv b/lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv
similarity index 100%
rename from src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv
rename to lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv
diff --git a/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv b/lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv
similarity index 100%
rename from src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv
rename to lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv
diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala
rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala
rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala
rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala
diff --git a/notebooks/samples/AzureSearchIndex - Met Artworks.ipynb b/notebooks/AzureSearchIndex - Met Artworks.ipynb
similarity index 100%
rename from notebooks/samples/AzureSearchIndex - Met Artworks.ipynb
rename to notebooks/AzureSearchIndex - Met Artworks.ipynb
diff --git a/notebooks/samples/Classification - Adult Census with Vowpal Wabbit.ipynb b/notebooks/Classification - Adult Census with Vowpal Wabbit.ipynb
similarity index 98%
rename from notebooks/samples/Classification - Adult Census with Vowpal Wabbit.ipynb
rename to notebooks/Classification - Adult Census with Vowpal Wabbit.ipynb
index e7098605ccc..4608bce764e 100644
--- a/notebooks/samples/Classification - Adult Census with Vowpal Wabbit.ipynb
+++ b/notebooks/Classification - Adult Census with Vowpal Wabbit.ipynb
@@ -8,7 +8,7 @@
"# Classification - Adult Census using Vowpal Wabbit in MMLSpark\n",
"\n",
"In this example, we predict incomes from the *Adult Census* dataset using Vowpal Wabbit (VW) classifier in MMLSpark.\n",
- "First, we read the data and split it into train and test sets as in this [example](https://github.com/Azure/mmlspark/blob/master/notebooks/samples/Classification%20-%20Adult%20Census.ipynb\n",
+ "First, we read the data and split it into train and test sets as in this [example](https://github.com/Azure/mmlspark/blob/master/notebooks/Classification%20-%20Adult%20Census.ipynb\n",
")."
]
},
diff --git a/notebooks/samples/Classification - Adult Census.ipynb b/notebooks/Classification - Adult Census.ipynb
similarity index 100%
rename from notebooks/samples/Classification - Adult Census.ipynb
rename to notebooks/Classification - Adult Census.ipynb
diff --git a/notebooks/samples/Classification - Before and After MMLSpark.ipynb b/notebooks/Classification - Before and After MMLSpark.ipynb
similarity index 100%
rename from notebooks/samples/Classification - Before and After MMLSpark.ipynb
rename to notebooks/Classification - Before and After MMLSpark.ipynb
diff --git a/notebooks/samples/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb b/notebooks/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb
similarity index 100%
rename from notebooks/samples/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb
rename to notebooks/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb
diff --git a/notebooks/samples/Cognitive Services - Overview.ipynb b/notebooks/Cognitive Services - Overview.ipynb
similarity index 100%
rename from notebooks/samples/Cognitive Services - Overview.ipynb
rename to notebooks/Cognitive Services - Overview.ipynb
diff --git a/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb b/notebooks/CognitiveServices - Celebrity Quote Analysis.ipynb
similarity index 100%
rename from notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb
rename to notebooks/CognitiveServices - Celebrity Quote Analysis.ipynb
diff --git a/notebooks/samples/ConditionalKNN - Exploring Art Across Cultures.ipynb b/notebooks/ConditionalKNN - Exploring Art Across Cultures.ipynb
similarity index 100%
rename from notebooks/samples/ConditionalKNN - Exploring Art Across Cultures.ipynb
rename to notebooks/ConditionalKNN - Exploring Art Across Cultures.ipynb
diff --git a/notebooks/samples/CyberML - Anomalous Access Detection.ipynb b/notebooks/CyberML - Anomalous Access Detection.ipynb
similarity index 100%
rename from notebooks/samples/CyberML - Anomalous Access Detection.ipynb
rename to notebooks/CyberML - Anomalous Access Detection.ipynb
diff --git a/notebooks/samples/DeepLearning - BiLSTM Medical Entity Extraction.ipynb b/notebooks/DeepLearning - BiLSTM Medical Entity Extraction.ipynb
similarity index 100%
rename from notebooks/samples/DeepLearning - BiLSTM Medical Entity Extraction.ipynb
rename to notebooks/DeepLearning - BiLSTM Medical Entity Extraction.ipynb
diff --git a/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb b/notebooks/DeepLearning - CIFAR10 Convolutional Network.ipynb
similarity index 100%
rename from notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb
rename to notebooks/DeepLearning - CIFAR10 Convolutional Network.ipynb
diff --git a/notebooks/samples/DeepLearning - Flower Image Classification.ipynb b/notebooks/DeepLearning - Flower Image Classification.ipynb
similarity index 98%
rename from notebooks/samples/DeepLearning - Flower Image Classification.ipynb
rename to notebooks/DeepLearning - Flower Image Classification.ipynb
index 0b6100ae81f..165bd30ce17 100644
--- a/notebooks/samples/DeepLearning - Flower Image Classification.ipynb
+++ b/notebooks/DeepLearning - Flower Image Classification.ipynb
@@ -51,7 +51,8 @@
"outputs": [],
"source": [
"from mmlspark.opencv import ImageTransformer\n",
- "from mmlspark.image import UnrollImage, ImageFeaturizer\n",
+ "from mmlspark.image import UnrollImage\n",
+ "from mmlspark.cntk import ImageFeaturizer\n",
"from mmlspark.stages import *\n",
"\n",
"# Make some featurizers\n",
@@ -220,4 +221,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/notebooks/samples/DeepLearning - Transfer Learning.ipynb b/notebooks/DeepLearning - Transfer Learning.ipynb
similarity index 100%
rename from notebooks/samples/DeepLearning - Transfer Learning.ipynb
rename to notebooks/DeepLearning - Transfer Learning.ipynb
diff --git a/notebooks/samples/HttpOnSpark - Working with Arbitrary Web APIs.ipynb b/notebooks/HttpOnSpark - Working with Arbitrary Web APIs.ipynb
similarity index 100%
rename from notebooks/samples/HttpOnSpark - Working with Arbitrary Web APIs.ipynb
rename to notebooks/HttpOnSpark - Working with Arbitrary Web APIs.ipynb
diff --git a/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb b/notebooks/HyperParameterTuning - Fighting Breast Cancer.ipynb
similarity index 100%
rename from notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb
rename to notebooks/HyperParameterTuning - Fighting Breast Cancer.ipynb
diff --git a/notebooks/samples/LightGBM - Overview.ipynb b/notebooks/LightGBM - Overview.ipynb
similarity index 100%
rename from notebooks/samples/LightGBM - Overview.ipynb
rename to notebooks/LightGBM - Overview.ipynb
diff --git a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb b/notebooks/ModelInterpretation - Snow Leopard Detection.ipynb
similarity index 99%
rename from notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb
rename to notebooks/ModelInterpretation - Snow Leopard Detection.ipynb
index 097cb3dee18..4be5c881bc3 100644
--- a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb
+++ b/notebooks/ModelInterpretation - Snow Leopard Detection.ipynb
@@ -198,7 +198,7 @@
"from pyspark.ml.classification import LogisticRegression\n",
"from pyspark.sql.functions import udf\n",
"from mmlspark.downloader import ModelDownloader\n",
- "from mmlspark.image import ImageFeaturizer \n",
+ "from mmlspark.cntk import ImageFeaturizer\n",
"from mmlspark.stages import UDFTransformer\n",
"from pyspark.sql.types import *\n",
"\n",
diff --git a/notebooks/samples/OpenCV - Pipeline Image Transformations.ipynb b/notebooks/OpenCV - Pipeline Image Transformations.ipynb
similarity index 100%
rename from notebooks/samples/OpenCV - Pipeline Image Transformations.ipynb
rename to notebooks/OpenCV - Pipeline Image Transformations.ipynb
diff --git a/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb b/notebooks/Regression - Flight Delays with DataCleaning.ipynb
similarity index 100%
rename from notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb
rename to notebooks/Regression - Flight Delays with DataCleaning.ipynb
diff --git a/notebooks/samples/Regression - Auto Imports.ipynb b/notebooks/Regression - Auto Imports.ipynb
similarity index 100%
rename from notebooks/samples/Regression - Auto Imports.ipynb
rename to notebooks/Regression - Auto Imports.ipynb
diff --git a/notebooks/samples/Regression - Flight Delays.ipynb b/notebooks/Regression - Flight Delays.ipynb
similarity index 100%
rename from notebooks/samples/Regression - Flight Delays.ipynb
rename to notebooks/Regression - Flight Delays.ipynb
diff --git a/notebooks/samples/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb b/notebooks/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb
similarity index 100%
rename from notebooks/samples/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb
rename to notebooks/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb
diff --git a/notebooks/samples/SparkServing - Deploying a Classifier.ipynb b/notebooks/SparkServing - Deploying a Classifier.ipynb
similarity index 100%
rename from notebooks/samples/SparkServing - Deploying a Classifier.ipynb
rename to notebooks/SparkServing - Deploying a Classifier.ipynb
diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb b/notebooks/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb
similarity index 100%
rename from notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb
rename to notebooks/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb
diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb b/notebooks/TextAnalytics - Amazon Book Reviews.ipynb
similarity index 100%
rename from notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb
rename to notebooks/TextAnalytics - Amazon Book Reviews.ipynb
diff --git a/notebooks/samples/Vowpal Wabbit - Overview.ipynb b/notebooks/Vowpal Wabbit - Overview.ipynb
similarity index 100%
rename from notebooks/samples/Vowpal Wabbit - Overview.ipynb
rename to notebooks/Vowpal Wabbit - Overview.ipynb
diff --git a/src/main/python/mmlspark/opencv/ImageTransformer.py b/opencv/src/main/python/mmlspark/opencv/ImageTransformer.py
similarity index 100%
rename from src/main/python/mmlspark/opencv/ImageTransformer.py
rename to opencv/src/main/python/mmlspark/opencv/ImageTransformer.py
diff --git a/src/test/python/mmlsparktest/cyber/__init__.py b/opencv/src/main/python/mmlspark/opencv/__init__.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/__init__.py
rename to opencv/src/main/python/mmlspark/opencv/__init__.py
diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenter.scala
similarity index 96%
rename from src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala
rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenter.scala
index d957e949630..ae89e80dd91 100644
--- a/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala
+++ b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenter.scala
@@ -1,12 +1,11 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.
-package com.microsoft.ml.spark.image
+package com.microsoft.ml.spark.opencv
import com.microsoft.ml.spark.codegen.Wrappable
import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol}
import com.microsoft.ml.spark.logging.BasicLogging
-import com.microsoft.ml.spark.opencv.{Flip, ImageTransformer}
import org.apache.spark.ml._
import org.apache.spark.ml.image.ImageSchema
import org.apache.spark.ml.param._
diff --git a/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala
rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala
rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala
rename to opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala
index 5d05a243ccf..b20b309bb05 100644
--- a/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala
+++ b/opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala
@@ -8,15 +8,15 @@ import java.net.URL
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
-import com.microsoft.ml.spark.opencv.{ImageTestUtils, ImageTransformer}
+import com.microsoft.ml.spark.io.IOImplicits._
+import com.microsoft.ml.spark.opencv.{ImageTransformer, OpenCVTestUtils}
+import org.apache.commons.io.FileUtils
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{DataFrame, Row}
-import com.microsoft.ml.spark.io.IOImplicits._
-import org.apache.commons.io.FileUtils
class ResizeImageTransformerSuite extends TransformerFuzzing[ResizeImageTransformer]
- with ImageTestUtils {
+ with OpenCVTestUtils {
lazy val images: DataFrame = spark.read.image
.option("dropInvalid", true).load(FileUtilities.join(fileLocation, "**").toString)
diff --git a/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenterSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala
rename to opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenterSuite.scala
index 51993e8e955..427f84d08fb 100644
--- a/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala
+++ b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageSetAugmenterSuite.scala
@@ -1,7 +1,7 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.
-package com.microsoft.ml.spark.image
+package com.microsoft.ml.spark.opencv
import com.microsoft.ml.spark.build.BuildInfo
import com.microsoft.ml.spark.core.test.base.LinuxOnly
diff --git a/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala
similarity index 98%
rename from src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala
rename to opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala
index 6c7ab6dfe53..62a43aa5e93 100644
--- a/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala
+++ b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala
@@ -23,7 +23,7 @@ import org.opencv.imgproc.Imgproc
import org.scalactic.Equality
import org.scalatest.Assertion
-trait ImageTestUtils {
+trait OpenCVTestUtils {
lazy protected val fileLocation = FileUtilities.join(BuildInfo.datasetDir, "Images", "Grocery")
protected def selectTestImageBytes(images: DataFrame): Array[Byte] = {
@@ -81,7 +81,7 @@ trait ImageTestUtils {
}
-class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with ImageTestUtils with DataFrameEquality {
+class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with OpenCVTestUtils with DataFrameEquality {
lazy val filesRoot = BuildInfo.datasetDir
lazy val imagePath = FileUtilities.join(filesRoot,"Images", "CIFAR").toString
@@ -128,7 +128,7 @@ class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with ImageTestUti
}
class UnrollBinaryImageSuite extends TransformerFuzzing[UnrollBinaryImage]
- with ImageTestUtils with DataFrameEquality {
+ with OpenCVTestUtils with DataFrameEquality {
lazy val filesRoot = BuildInfo.datasetDir
lazy val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
@@ -163,7 +163,7 @@ class UnrollBinaryImageSuite extends TransformerFuzzing[UnrollBinaryImage]
override def reader: UnrollBinaryImage.type = UnrollBinaryImage
}
-class ImageTransformerSuite extends TransformerFuzzing[ImageTransformer] with ImageTestUtils {
+class ImageTransformerSuite extends TransformerFuzzing[ImageTransformer] with OpenCVTestUtils {
//TODO this is needed to stop the build from freezing
override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Unit = {
diff --git a/pipeline.yaml b/pipeline.yaml
index 7a4eaf66ddf..eb25b5c40c4 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -33,7 +33,6 @@ jobs:
pool:
vmImage: ubuntu-18.04
steps:
- - template: templates/ivy_cache.yml
- task: AzureCLI@1
displayName: 'Style Check'
inputs:
@@ -46,7 +45,7 @@ jobs:
pool:
vmImage: ubuntu-18.04
steps:
- - template: templates/ivy_cache_2.yml
+ #- template: templates/ivy_cache.yml
- bash: echo "##vso[task.prependpath]$CONDA/bin"
displayName: Add conda to PATH
- bash: conda info
@@ -87,7 +86,7 @@ jobs:
pool:
vmImage: ubuntu-18.04
steps:
- - template: templates/ivy_cache_2.yml
+ #- template: templates/ivy_cache.yml
- bash: echo "##vso[task.prependpath]$CONDA/bin"
displayName: Add conda to PATH
- bash: conda info
@@ -128,14 +127,13 @@ jobs:
pool:
vmImage: ubuntu-18.04
steps:
- - template: templates/ivy_cache.yml
- task: AzureCLI@1
displayName: 'Get Docker Tag + Version'
inputs:
azureSubscription: 'MMLSpark Build'
scriptLocation: inlineScript
inlineScript: |
- VERSION=$(sbt version | tail -1 | cut -d' ' -f2 | sed 's/\x1b\[[0-9;]*m//g')
+ VERSION=$(sbt "core/version" | tail -1 | cut -d' ' -f2 | sed 's/\x1b\[[0-9;]*m//g')
echo '##vso[task.setvariable variable=version]'$VERSION
echo '##vso[task.setvariable variable=gittag]'$(git tag -l --points-at HEAD)
- task: Docker@2
@@ -226,7 +224,7 @@ jobs:
pool:
vmImage: ubuntu-18.04
steps:
- - template: templates/ivy_cache_2.yml
+ #- template: templates/ivy_cache.yml
- bash: echo "##vso[task.prependpath]$CONDA/bin"
displayName: Add conda to PATH
- bash: conda env create -f environment.yaml
@@ -275,7 +273,7 @@ jobs:
pool:
vmImage: ubuntu-18.04
steps:
- - template: templates/ivy_cache_2.yml
+ #- template: templates/ivy_cache_2.yml
- bash: echo "##vso[task.prependpath]$CONDA/bin"
displayName: Add conda to PATH
- bash: conda env create -f environment.yaml
@@ -389,7 +387,7 @@ jobs:
vw:
PACKAGE: "vw"
steps:
- - template: templates/ivy_cache_2.yml
+ #- template: templates/ivy_cache.yml
- task: AzureCLI@1
displayName: 'Setup repo'
inputs:
diff --git a/project/BlobMavenPlugin.scala b/project/BlobMavenPlugin.scala
new file mode 100644
index 00000000000..de8114172e0
--- /dev/null
+++ b/project/BlobMavenPlugin.scala
@@ -0,0 +1,48 @@
+import java.io.File
+
+import BlobMavenPlugin.autoImport.publishBlob
+import BuildUtils.{join, uploadToBlob}
+import sbt._
+import Keys._
+import org.apache.ivy.core.IvyPatternHelper
+
+//noinspection ScalaStyle
+object BlobMavenPlugin extends AutoPlugin {
+ override def trigger = allRequirements
+
+ object autoImport {
+ val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob")
+ val blobArtifactInfo = SettingKey[String]("blobArtifactInfo")
+ }
+
+ import autoImport._
+
+ override def requires: Plugins = sbt.Plugins.empty
+
+ override lazy val projectSettings: Seq[Setting[_]] = Seq(
+ publishBlob := {
+ publishM2.value
+ //TODO: make this more general - the "1.0" segment is hard-coded and there is no obvious sbt key that exposes it
+ val sourceArtifactName = s"${moduleName.value}_${scalaBinaryVersion.value}_1.0"
+ val destArtifactName = s"${moduleName.value}"
+ val repositoryDir = new File(new URI(Resolver.mavenLocal.root))
+ val orgDirs = organization.value.split(".".toCharArray.head)
+ val localPackageFolder = join(repositoryDir, orgDirs ++ Seq(sourceArtifactName, version.value):_*).toString
+ val blobMavenFolder = (orgDirs ++ Seq(destArtifactName, version.value)).mkString("/")
+ uploadToBlob(localPackageFolder, blobMavenFolder, "maven")
+ println(blobArtifactInfo.value)
+ },
+ blobArtifactInfo := {
+ s"""
+ |MMLSpark Build and Release Information
+ |---------------
+ |
+ |### Maven Coordinates
+ | `${organization.value}:${moduleName.value}:${version.value}`
+ |
+ |### Maven Resolver
+ | `https://mmlspark.azureedge.net/maven`
+ |""".stripMargin
+ }
+ )
+}
\ No newline at end of file
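
For orientation, the path construction in `publishBlob` reproduces the standard Maven repository layout before mirroring it to blob storage. A small illustration of that computation with made-up coordinates:

```scala
// Illustrative values only: organization, module, and version are examples.
val orgDirs = "com.microsoft.ml.spark".split('.') // Array(com, microsoft, ml, spark)
val blobMavenFolder =
  (orgDirs ++ Seq("mmlspark-core", "1.0.0-rc4")).mkString("/")
// => "com/microsoft/ml/spark/mmlspark-core/1.0.0-rc4"
```

Running `publishBlob` from the sbt shell thus publishes to the local repository first (via `publishM2`) and then uploads that folder to the `maven` blob container.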
diff --git a/project/CodegenPlugin.scala b/project/CodegenPlugin.scala
new file mode 100644
index 00000000000..0c660663f93
--- /dev/null
+++ b/project/CodegenPlugin.scala
@@ -0,0 +1,245 @@
+import java.io.File
+
+import BuildUtils.{join, runCmd, singleUploadToBlob, zipFolder}
+import CondaPlugin.autoImport.{activateCondaEnv, condaEnvLocation, createCondaEnvTask}
+import org.apache.commons.io.FileUtils
+import sbt.Keys._
+import sbt.{Def, _}
+import spray.json._
+
+object CodegenConfigProtocol extends DefaultJsonProtocol {
+ implicit val CCFormat: RootJsonFormat[CodegenConfig] = jsonFormat8(CodegenConfig.apply)
+}
+
+import CodegenConfigProtocol._
+
+case class CodegenConfig(name: String,
+ jarName: Option[String],
+ topDir: String,
+ targetDir: String,
+ version: String,
+ pythonizedVersion: String,
+ rVersion: String,
+ packageName: String)
+
+//noinspection ScalaStyle
+object CodegenPlugin extends AutoPlugin {
+ override def trigger = allRequirements
+
+ override def requires: Plugins = CondaPlugin
+
+ def rCmd(activateCondaEnv: Seq[String], cmd: Seq[String], wd: File, libPath: String): Unit = {
+ runCmd(activateCondaEnv ++ cmd, wd, Map("R_LIBS" -> libPath, "R_USER_LIBS" -> libPath))
+ }
+
+ val RInstall = Tags.Tag("rInstall")
+
+ object autoImport {
+ val pythonizedVersion = settingKey[String]("Pythonized version")
+ val rVersion = settingKey[String]("R version")
+ val genPackageNamespace = settingKey[String]("genPackageNamespace")
+ val genTestPackageNamespace = settingKey[String]("genTestPackageNamespace")
+ val codegenJarName = settingKey[Option[String]]("codegenJarName")
+ val testgenJarName = settingKey[Option[String]]("testgenJarName")
+ val codegenArgs = settingKey[String]("codegenArgs")
+ val testgenArgs = settingKey[String]("testgenArgs")
+
+
+ val targetDir = settingKey[File]("targetDir")
+ val codegenDir = settingKey[File]("codegenDir")
+
+ val codegen = TaskKey[Unit]("codegen", "Generate Code")
+ val testgen = TaskKey[Unit]("testgen", "Generate Tests")
+
+ val packageR = TaskKey[Unit]("packageR", "Generate roxygen docs and zip R package")
+ val publishR = TaskKey[Unit]("publishR", "publish R package to blob")
+ val testR = TaskKey[Unit]("testR", "Run testthat on R tests")
+
+ val packagePython = TaskKey[Unit]("packagePython", "Package python sdk")
+ val installPipPackage = TaskKey[Unit]("installPipPackage", "install python sdk")
+ val publishPython = TaskKey[Unit]("publishPython", "publish python wheel")
+ val testPython = TaskKey[Unit]("testPython", "test python sdk")
+
+ val mergePyCodeDir = SettingKey[File]("mergePyCodeDir")
+ val mergePyCode = TaskKey[Unit]("mergePyCode", "copy python code to a destination")
+ }
+
+ import autoImport._
+
+ override lazy val globalSettings: Seq[Setting[_]] = Seq(
+ Global / concurrentRestrictions += Tags.limit(RInstall, 1)
+ )
+
+ def testRImpl: Def.Initialize[Task[Unit]] = Def.task {
+ packageR.value
+ publishLocal.value
+ val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString
+ val rSrcDir = join(codegenDir.value, "src", "R", genPackageNamespace.value)
+ rCmd(activateCondaEnv.value,
+ Seq("R", "CMD", "INSTALL", "--no-multiarch", "--with-keep.source", genPackageNamespace.value),
+ rSrcDir.getParentFile, libPath)
+ val testRunner = join("tools", "tests", "run_r_tests.R")
+ if (join(rSrcDir, "tests").exists()) {
+ rCmd(activateCondaEnv.value,
+ Seq("Rscript", testRunner.getAbsolutePath), rSrcDir, libPath)
+ }
+ } tag(RInstall)
+
+
+ override lazy val projectSettings: Seq[Setting[_]] = Seq(
+ publishMavenStyle := true,
+ codegenArgs := {
+ CodegenConfig(
+ name.value,
+ codegenJarName.value,
+ baseDirectory.value.getAbsolutePath,
+ targetDir.value.getAbsolutePath,
+ version.value,
+ pythonizedVersion.value,
+ rVersion.value,
+ genPackageNamespace.value
+ ).toJson.compactPrint
+ },
+ testgenArgs := {
+ CodegenConfig(
+ name.value,
+ testgenJarName.value,
+ baseDirectory.value.getAbsolutePath,
+ targetDir.value.getAbsolutePath,
+ version.value,
+ pythonizedVersion.value,
+ rVersion.value,
+ genPackageNamespace.value
+ ).toJson.compactPrint
+ },
+ codegenJarName := {
+ val art: Artifact = (Compile / packageBin / artifact).value
+ Some(artifactName.value(
+ ScalaVersion(scalaVersion.value, scalaBinaryVersion.value),
+ projectID.value,
+ art))
+ },
+ testgenJarName := {
+ val art: Artifact = (Test / packageBin / artifact).value
+ Some(artifactName.value(
+ ScalaVersion(scalaVersion.value, scalaBinaryVersion.value),
+ projectID.value,
+ art))
+ },
+ codegen := (Def.taskDyn {
+ (Compile / compile).value
+ (Test / compile).value
+ val arg = codegenArgs.value
+ Def.task {
+ (Compile / runMain).toTask(s" com.microsoft.ml.spark.codegen.CodeGen $arg").value
+ }
+ }.value),
+ testgen := (Def.taskDyn {
+ (Compile / compile).value
+ (Test / compile).value
+ val arg = testgenArgs.value
+ Def.task {
+ (Test / runMain).toTask(s" com.microsoft.ml.spark.codegen.TestGen $arg").value
+ }
+ }.value),
+ pythonizedVersion := {
+ if (version.value.contains("-")) {
+ version.value.split("-".head).head + ".dev1"
+ } else {
+ version.value
+ }
+ },
+ rVersion := {
+ if (version.value.contains("-")) {
+ version.value.split("-".head).head
+ } else {
+ version.value
+ }
+ },
+ packageR := {
+ createCondaEnvTask.value
+ codegen.value
+ val rSrcDir = join(codegenDir.value, "src", "R", genPackageNamespace.value)
+ val rPackageDir = join(codegenDir.value, "package", "R")
+ val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString
+ rCmd(activateCondaEnv.value, Seq("R", "-q", "-e", "roxygen2::roxygenise()"), rSrcDir, libPath)
+ rPackageDir.mkdirs()
+ zipFolder(rSrcDir, new File(rPackageDir, s"${name.value}-${version.value}.zip"))
+ },
+ testR := testRImpl.value,
+ publishR := {
+ codegen.value
+ packageR.value
+ val rPackageDir = join(codegenDir.value, "package", "R")
+ val rPackage = rPackageDir.listFiles().head
+ singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr")
+ },
+ packagePython := {
+ codegen.value
+ createCondaEnvTask.value
+ val destPyDir = join(targetDir.value, "classes", genPackageNamespace.value)
+ val packageDir = join(codegenDir.value, "package", "python").absolutePath
+ val pythonSrcDir = join(codegenDir.value, "src", "python")
+ if (destPyDir.exists()) FileUtils.forceDelete(destPyDir)
+ val sourcePyDir = join(pythonSrcDir.getAbsolutePath, genPackageNamespace.value)
+ FileUtils.copyDirectory(sourcePyDir, destPyDir)
+ runCmd(
+ activateCondaEnv.value ++
+ Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", packageDir),
+ pythonSrcDir)
+ },
+ installPipPackage := {
+ packagePython.value
+ publishLocal.value
+ runCmd(
+ activateCondaEnv.value ++ Seq("pip", "install", "-I",
+ s"${name.value.replace("-", "_")}-${pythonizedVersion.value}-py2.py3-none-any.whl"),
+ join(codegenDir.value, "package", "python"))
+ },
+ publishPython := {
+ publishLocal.value
+ packagePython.value
+ val fn = s"${name.value.replace("-", "_")}-${pythonizedVersion.value}-py2.py3-none-any.whl"
+ singleUploadToBlob(
+ join(codegenDir.value, "package", "python", fn).toString,
+ version.value + "/" + fn, "pip")
+ },
+ mergePyCode := {
+ val srcDir = join(codegenDir.value, "src", "python", genPackageNamespace.value)
+ val destDir = join(mergePyCodeDir.value, "src", "python", genPackageNamespace.value)
+ FileUtils.copyDirectory(srcDir, destDir)
+ },
+ testPython := {
+ installPipPackage.value
+ testgen.value
+ val mainTargetDir = join(baseDirectory.value.getParent, "target")
+ runCmd(
+ activateCondaEnv.value ++ Seq("python",
+ "-m",
+ "pytest",
+ s"--cov=${genPackageNamespace.value}",
+ s"--junitxml=${join(mainTargetDir, s"python-test-results-${name.value}.xml")}",
+ "--cov-report=xml",
+ genTestPackageNamespace.value
+ ),
+ new File(codegenDir.value, "test/python/")
+ )
+ },
+ targetDir := {
+ artifactPath.in(packageBin).in(Compile).value.getParentFile
+ },
+ mergePyCodeDir := {
+ join(baseDirectory.value.getParent, "target", "scala-2.12", "sbt-1.0", "generated")
+ },
+ codegenDir := {
+ join(targetDir.value, "generated")
+ },
+ genPackageNamespace := {
+ "mmlspark"
+ },
+ genTestPackageNamespace := {
+ "mmlsparktest"
+ }
+
+ )
+}
\ No newline at end of file
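
Note: `pythonizedVersion` and `rVersion` above normalize suffixed sbt-dynver
versions for the two package ecosystems. A minimal standalone sketch of the
mapping (the object and method names here are illustrative, not part of the
build):

    object VersionSketch {
      // Suffixed builds like "1.0.0-rc3" or "1.0.0-5-gabc123" become PEP 440
      // dev releases for pip, and lose the suffix entirely for R.
      def pythonized(v: String): String =
        if (v.contains("-")) v.split("-").head + ".dev1" else v

      def rFriendly(v: String): String =
        if (v.contains("-")) v.split("-").head else v

      def main(args: Array[String]): Unit = {
        assert(pythonized("1.0.0-rc3") == "1.0.0.dev1")
        assert(rFriendly("1.0.0-rc3") == "1.0.0")
        assert(pythonized("1.0.0") == "1.0.0") // plain releases pass through
      }
    }
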
diff --git a/project/CondaPlugin.scala b/project/CondaPlugin.scala
new file mode 100644
index 00000000000..4e3e3ce005b
--- /dev/null
+++ b/project/CondaPlugin.scala
@@ -0,0 +1,56 @@
+import BuildUtils.{osPrefix, runCmd}
+import sbt._
+import Keys._
+
+import scala.sys.process.Process
+
+//noinspection ScalaStyle
+object CondaPlugin extends AutoPlugin {
+ override def trigger = allRequirements
+
+ object autoImport {
+ val condaEnvName = settingKey[String]("Name of conda environment")
+ val cleanCondaEnvTask = TaskKey[Unit]("cleanCondaEnv", "remove conda env")
+ val condaEnvLocation = TaskKey[File]("condaEnvLocation", "get install location of conda env")
+ val createCondaEnvTask = TaskKey[Unit]("createCondaEnv", "create conda env")
+ val activateCondaEnv = settingKey[Seq[String]]("commands to activate conda environment")
+ }
+
+ import autoImport._
+ override lazy val globalSettings: Seq[Setting[_]] = Seq(
+ condaEnvName := "mmlspark",
+ cleanCondaEnvTask := {
+ runCmd(Seq("conda", "env", "remove", "--name", condaEnvName.value, "-y"))
+ },
+ condaEnvLocation := {
+ createCondaEnvTask.value
+ new File(Process("conda env list").lineStream.toList
+ .map(_.split("\\s+"))
+ .map(l => (l.head, l.reverse.head))
+ .filter(p => p._1 == condaEnvName.value)
+ .head._2)
+ },
+ createCondaEnvTask := {
+ val hasEnv = Process("conda env list").lineStream.toList
+ .map(_.split("\\s+").head).contains(condaEnvName.value)
+ if (!hasEnv) {
+ runCmd(Seq("conda", "env", "create", "-f", "environment.yaml"))
+ } else {
+ println("Found conda env " + condaEnvName.value)
+ }
+ },
+ activateCondaEnv := {
+ if (sys.props("os.name").toLowerCase.contains("windows")) {
+ osPrefix ++ Seq("activate", condaEnvName.value, "&&")
+ } else {
+ Seq()
+ //TODO: figure out why this doesn't work
+ //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&")
+ }
+ }
+ )
+
+ override def requires: Plugins = sbt.Plugins.empty
+
+ override lazy val projectSettings: Seq[Setting[_]] = Seq()
+}
\ No newline at end of file
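
Note: `condaEnvLocation` above assumes the tabular output of `conda env list`
and pairs the first token of each line (the environment name) with the last
(its install path). A sketch of that parsing against sample output (the sample
lines are assumed for illustration, not captured from a real run):

    object CondaListSketch {
      val sample = Seq(
        "# conda environments:",
        "#",
        "base                  *  /opt/conda",
        "mmlspark                 /opt/conda/envs/mmlspark")

      def main(args: Array[String]): Unit = {
        val envPath = sample
          .map(_.split("\\s+"))
          .map(tokens => (tokens.head, tokens.last)) // (name, path)
          .collectFirst { case ("mmlspark", path) => path }
          .get
        assert(envPath == "/opt/conda/envs/mmlspark")
      }
    }
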
diff --git a/project/build.scala b/project/build.scala
index f7816cd5d48..06a930e33d1 100644
--- a/project/build.scala
+++ b/project/build.scala
@@ -2,8 +2,12 @@ import java.io.File
import java.lang.ProcessBuilder.Redirect
object BuildUtils {
+ def join(root: File, folders: String*): File = {
+ folders.foldLeft(root) { case (f, s) => new File(f, s) }
+ }
+
def join(folders: String*): File = {
- folders.tail.foldLeft(new File(folders.head)) { case (f, s) => new File(f, s) }
+ join(new File(folders.head), folders.tail: _*)
}
def isWindows: Boolean = {
@@ -27,7 +31,7 @@ object BuildUtils {
.redirectError(Redirect.INHERIT)
.redirectOutput(Redirect.INHERIT)
val env = pb.environment()
- envVars.foreach(p =>env.put(p._1,p._2))
+ envVars.foreach(p => env.put(p._1, p._2))
assert(pb.start().waitFor() == 0)
}
@@ -56,6 +60,7 @@ object BuildUtils {
"--account-key", Secrets.storageKey)
runCmd(osPrefix ++ command)
}
+
def singleUploadToBlob(source: String,
dest: String,
container: String,
@@ -76,6 +81,7 @@ object BuildUtils {
val (dirs, files) = dir.listFiles.sorted.partition(_.isDirectory)
(if (pred == null) files else files.filter(pred)) ++ dirs.flatMap(loop)
}
+
loop(dir)
}
@@ -91,7 +97,9 @@ object BuildUtils {
zip.putNextEntry(new ZipEntry(file.toString.substring(prefixLen).replace(java.io.File.separator, "/")))
val in = new BufferedInputStream(new FileInputStream(file), bufferSize)
var b = 0
- while (b >= 0) { zip.write(data, 0, b); b = in.read(data, 0, bufferSize) }
+ while (b >= 0) {
+ zip.write(data, 0, b); b = in.read(data, 0, bufferSize)
+ }
in.close()
zip.closeEntry()
}
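
Note: the new `join(root: File, folders: String*)` overload lets callers root
a path at an existing `File` (such as `codegenDir.value`), and the original
all-`String` overload now delegates to it. A quick sketch of the equivalence,
assuming it runs where `BuildUtils` is on the classpath (e.g. the `project/`
meta-build):

    import java.io.File

    object JoinSketch {
      def main(args: Array[String]): Unit = {
        val a = BuildUtils.join("target", "generated", "src")
        val b = BuildUtils.join(new File("target"), "generated", "src")
        assert(a.getPath == b.getPath) // both build target/generated/src
      }
    }
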
diff --git a/project/plugins.sbt b/project/plugins.sbt
index cc082cf59b0..6f4bd427f23 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -4,4 +4,4 @@ addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0")
addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.8")
addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1")
addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.0.0")
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0")
\ No newline at end of file
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0")
diff --git a/src/main/python/setup.py b/src/main/python/setup.py
deleted file mode 100644
index 3ba8474be22..00000000000
--- a/src/main/python/setup.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (C) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License. See LICENSE in project root for information.
-
-import os
-from setuptools import setup, find_packages
-import codecs
-import os.path
-
-
-def read(rel_path):
- here = os.path.abspath(os.path.dirname(__file__))
- with codecs.open(os.path.join(here, rel_path), "r") as fp:
- return fp.read()
-
-
-def get_version(rel_path):
- for line in read(rel_path).splitlines():
- if line.startswith("__version__"):
- delim = '"' if '"' in line else "'"
- return line.split(delim)[1]
- return "0.0.0"
-
-
-setup(
- name="mmlspark",
- version=get_version("mmlspark/__init__.py"),
- description="Microsoft ML for Spark",
- long_description="Microsoft ML for Apache Spark contains Microsoft's open source "
- + "contributions to the Apache Spark ecosystem",
- license="MIT",
- packages=find_packages(),
- url="https://github.com/Azure/mmlspark",
- author="Microsoft",
- author_email="mmlspark-support@microsoft.com",
- classifiers=[
- "Development Status :: 3 - Alpha",
- "Intended Audience :: Developers",
- "Intended Audience :: Data Scientists",
- "Topic :: Software Development :: Datascience Tools",
- "License :: OSI Approved :: MIT License",
- "Programming Language :: Python :: 2",
- "Programming Language :: Python :: 3",
- ],
- zip_safe=True,
- package_data={"mmlspark": ["../LICENSE.txt", "../README.txt"]},
-)
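
Note: with this static setup.py removed, `packagePython` builds the wheel from
the codegen source tree (`python setup.py bdist_wheel --universal`, run in
`join(codegenDir.value, "src", "python")`, where a setup.py is presumably
emitted by codegen), and `installPipPackage`/`publishPython` reconstruct the
wheel filename from the sbt `name` and `pythonizedVersion` settings instead of
parsing `__version__` out of `mmlspark/__init__.py`. A sketch of that filename
convention (the values are illustrative):

    object WheelNameSketch {
      def main(args: Array[String]): Unit = {
        val name = "mmlspark"
        val pythonizedVersion = "1.0.0.dev1"
        // Universal wheels are tagged py2.py3-none-any, and hyphens in the
        // distribution name are replaced by underscores in the filename.
        val wheel = s"${name.replace("-", "_")}-$pythonizedVersion-py2.py3-none-any.whl"
        assert(wheel == "mmlspark-1.0.0.dev1-py2.py3-none-any.whl")
      }
    }
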
diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala b/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala
deleted file mode 100644
index 03785cbd8c9..00000000000
--- a/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright (C) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License. See LICENSE in project root for information.
-
-package com.microsoft.ml.spark.codegen
-
-import java.io.File
-
-import com.microsoft.ml.spark.build.BuildInfo
-
-object Config {
- val DebugMode = sys.env.getOrElse("DEBUGMODE", "").trim.toLowerCase == "true"
-
- val TopDir = BuildInfo.baseDirectory
- val Version = BuildInfo.version
- val PackageName = BuildInfo.name
- val TargetDir = new File(TopDir, s"target/scala-${BuildInfo.scalaVersion.slice(0,4)}")
- val ScalaSrcDir = "src/main/scala"
-
- val GeneratedDir = new File(TargetDir, "generated")
- val PackageDir = new File(GeneratedDir, "package")
- val SrcDir = new File(GeneratedDir, "src")
- val TestDir = new File(GeneratedDir, "test")
- val DocDir = new File(GeneratedDir, "doc")
- val TestDataDir = new File(GeneratedDir, "test-data")
-
- //Python Codegen Constant
- val PySrcDir = new File(SrcDir, "python")
- val PyPackageDir = new File(PackageDir, "python")
- val PyTestDir = new File(TestDir, "python")
- val PySrcOverrideDir = new File(TopDir, "src/main/python")
- val PyTestOverrideDir = new File(TopDir, "src/test/python")
-
- //R Codegen Constants
- val RSrcRoot = new File(SrcDir, "R")
- val RSrcDir = new File(RSrcRoot, "mmlspark/R")
- val RPackageDir = new File(PackageDir, "R")
- val RTestDir = new File(RSrcRoot, "mmlspark/tests")
-
- val RTestOverrideDir = new File(TopDir, "src/test/R")
- val RSrcOverrideDir = new File(TopDir, "src/main/R")
-
- //val rPackageFile = new File(rPackageDir, s"mmlspark-$mmlVer.zip")
-
- val InternalPrefix = "_"
- val ScopeDepth = " " * 4
-
- val CopyrightLines =
- s"""|# Copyright (C) Microsoft Corporation. All rights reserved.
- |# Licensed under the MIT License. See LICENSE in project root for information.
- |""".stripMargin
-
- // The __init__.py file
- def packageHelp(importString: String): String = {
- s"""|$CopyrightLines
- |
- |"\""
- |MicrosoftML is a library of Python classes to interface with the
- |Microsoft scala APIs to utilize Apache Spark to create distributed
- |machine learning models.
- |
- |MicrosoftML simplifies training and scoring classifiers and
- |regressors, as well as facilitating the creation of models using the
- |CNTK library, images, and text.
- |"\""
- |
- |__version__ = "${BuildInfo.pythonizedVersion}"
- |__spark_package_version__ = "${BuildInfo.version}"
- |
- |$importString
- |""".stripMargin
- }
-}
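
Note: the deleted hard-coded `Config` object is superseded by the per-project
sbt settings earlier in this diff; roughly (a mapping inferred from the diff,
not stated anywhere in it): `GeneratedDir` becomes `codegenDir`, `PySrcDir` and
`RSrcRoot` become `join(codegenDir.value, "src", ...)`, `PyPackageDir` and
`RPackageDir` become `join(codegenDir.value, "package", ...)`, and the version
constants give way to the `pythonizedVersion` and `rVersion` keys.
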
diff --git a/src/test/python/mmlsparktest/cyber/utils/__init__.py b/src/test/python/mmlsparktest/cyber/utils/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/src/test/python/mmlsparktest/nn/__init__.py b/src/test/python/mmlsparktest/nn/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/src/test/python/mmlsparktest/recommendation/__init__.py b/src/test/python/mmlsparktest/recommendation/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/src/test/python/mmlsparktest/spark.py b/src/test/python/mmlsparktest/spark.py
deleted file mode 100644
index 6100bdf6cd7..00000000000
--- a/src/test/python/mmlsparktest/spark.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (C) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License. See LICENSE in project root for information.
-
-from pyspark.sql import SparkSession, SQLContext
-import os
-import mmlspark
-
-spark = SparkSession.builder \
- .master("local[*]") \
- .appName("PysparkTests") \
- .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.12:" + mmlspark.__spark_package_version__) \
- .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") \
- .config("spark.executor.heartbeatInterval", "60s") \
- .config("spark.sql.shuffle.partitions", 10) \
- .config("spark.sql.crossJoin.enabled", "true") \
- .getOrCreate()
-
-sc = SQLContext(spark.sparkContext)
diff --git a/src/test/python/mmlsparktest/vw/__init__.py b/src/test/python/mmlsparktest/vw/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala b/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala
deleted file mode 100644
index 4981013301c..00000000000
--- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (C) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License. See LICENSE in project root for information.
-
-package com.microsoft.ml.spark.cntk
-
-import java.io.File
-
-import com.microsoft.ml.spark.build.BuildInfo
-import com.microsoft.ml.spark.core.env.FileUtilities
-import com.microsoft.ml.spark.core.test.base.TestBase
-import com.microsoft.ml.spark.image.UnrollImage
-import org.apache.spark.ml.linalg.DenseVector
-import org.apache.spark.sql._
-import com.microsoft.ml.spark.io.IOImplicits._
-
-trait CNTKTestUtils extends TestBase {
-
- val filesRoot = BuildInfo.datasetDir.toString
- val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
- val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString
- val inputCol = "cntk_images"
- val outputCol = "out"
- val labelCol = "labels"
-
- val featureVectorLength = 3 * 32 * 32
- lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString
-
- def testModelDF(spark: SparkSession): DataFrame = {
- import spark.implicits._
- spark.sparkContext.parallelize(Seq(
- Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720,
- -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090),
- Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990,
- -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880),
- Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967,
- 3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830),
- Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430,
- -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510),
- Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690,
- 4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270),
- Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470,
- 0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF
- }
-
- def testImages(spark: SparkSession): DataFrame = {
- val images = spark.read.image.load(imagePath)
-
- val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol)
-
- unroll.transform(images).select(inputCol)
- }
-
- def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = {
- import spark.implicits._
- if (outputDouble) {
- List
- .fill(rows)(List.fill(size)(0.0).toArray)
- .zip(List.fill(rows)(0.0))
- .toDF(inputCol, labelCol)
- } else {
- List
- .fill(rows)(List.fill(size)(0.0.toFloat).toArray)
- .zip(List.fill(rows)(0.0))
- .toDF(inputCol, labelCol)
- }
- }
-
- protected def compareToTestModel(result: DataFrame) = {
- //TODO improve checks
- assert(result.columns.toSet == Set(inputCol, outputCol))
- assert(result.count() == testModelDF(result.sparkSession).count())
- val max = result
- .select(outputCol)
- .collect()
- .map(row => row.getAs[DenseVector](0).toArray.max)
- .max
- assert(max < 10 & max > -10)
- }
-
-}
diff --git a/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala b/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala
deleted file mode 100644
index 67d667e339e..00000000000
--- a/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala
+++ /dev/null
@@ -1,176 +0,0 @@
-// Copyright (C) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License. See LICENSE in project root for information.
-
-package com.microsoft.ml.spark.codegen
-
-import java.io.File
-import com.microsoft.ml.spark.build.BuildInfo
-import com.microsoft.ml.spark.codegen.Config._
-import com.microsoft.ml.spark.core.env.FileUtilities._
-import com.microsoft.ml.spark.core.test.base.TestBase
-import com.microsoft.ml.spark.core.test.fuzzing.PyTestFuzzing
-import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices
-import org.apache.commons.io.FileUtils
-import org.apache.commons.io.FilenameUtils._
-
-object CodeGenUtils {
- def clean(dir: File): Unit = if (dir.exists()) FileUtils.forceDelete(dir)
-
- def toDir(f: File): File = new File(f, File.separator)
-}
-
-object CodeGen {
-
- import CodeGenUtils._
-
- def generatePythonClasses(): Unit = {
- instantiateServices[PythonWrappable].foreach { w =>
- w.makePyFile()
- }
- }
-
- def generateRClasses(): Unit = {
- instantiateServices[RWrappable].foreach { w =>
- w.makeRFile()
- }
- }
-
- private def makeInitFiles(packageFolder: String = ""): Unit = {
- val dir = new File(new File(PySrcDir, "mmlspark"), packageFolder)
- val packageString = if (packageFolder != "") packageFolder.replace("/", ".") else ""
- val importStrings =
- dir.listFiles.filter(_.isFile).sorted
- .map(_.getName)
- .filter(name => name.endsWith(".py") && !name.startsWith("_") && !name.startsWith("test"))
- .map(name => s"from mmlspark$packageString.${getBaseName(name)} import *\n").mkString("")
- writeFile(new File(dir, "__init__.py"), packageHelp(importStrings))
- dir.listFiles().filter(_.isDirectory).foreach(f =>
- makeInitFiles(packageFolder + "/" + f.getName)
- )
- }
-
- //noinspection ScalaStyle
- def generateRPackageData(): Unit = {
- // description file; need to encode version as decimal
- val today = new java.text.SimpleDateFormat("yyyy-MM-dd")
- .format(new java.util.Date())
-
- RSrcDir.mkdirs()
- writeFile(new File(RSrcDir.getParentFile, "DESCRIPTION"),
- s"""|Package: mmlspark
- |Title: Access to MMLSpark via R
- |Description: Provides an interface to MMLSpark.
- |Version: ${BuildInfo.rVersion}
- |Date: $today
- |Author: Microsoft Corporation
- |Maintainer: MMLSpark Team