microsoft · mhamilton723 · Jan 12, 2022 · Jan 12, 2022
@@ -15,7 +15,7 @@ trait DataBalanceTestBase extends TestBase {
 
   import spark.implicits._
 
-  lazy val errorTolerance: Double = 1e-12
+  lazy val errorTolerance: Double = 1e-8
 
   lazy val featureProbCol = "featureProb"
   lazy val positiveFeatureCountCol = "positiveFeatureCount"

@@ -57,8 +57,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import time\n",
     "imageStream = spark.readStream.image().load(imageDir)\n",
     "query = imageStream.select(\"image.height\").writeStream.format(\"memory\").queryName(\"heights\").start()\n",
+    "time.sleep(3)\n",
     "print(\"Streaming query activity: {}\".format(query.isActive))"
    ]
   },
@@ -120,7 +122,7 @@
    "outputs": [],
    "source": [
     "from PIL import Image\n",
-    "\n",
+    "import matplotlib.pyplot as plt\n",
     "data = images.take(3)    # take first three rows of the dataframe\n",
     "im = data[2][0]          # the image is in the first column of a given row\n",
     "\n",
@@ -129,8 +131,8 @@
     "print(\"height: {}, width: {}, OpenCV type: {}\".format(im.height, im.width, im.mode))\n",
     "\n",
     "arr = toNDArray(im)     # convert to numpy array\n",
-    "Image.fromarray(arr, \"RGB\")   # display the image inside notebook\n",
-    "print(images.count())"
+    "print(images.count())\n",
+    "plt.imshow(Image.fromarray(arr, \"RGB\"))   # display the image inside notebook\n"
    ]
   },
   {
@@ -157,7 +159,7 @@
     "small = tr.transform(images).select(\"transformed\")\n",
     "\n",
     "im = small.take(3)[2][0]                  # take third image\n",
-    "Image.fromarray(toNDArray(im), \"RGB\")   # display the image inside notebook"
+    "plt.imshow(Image.fromarray(toNDArray(im), \"RGB\"))   # display the image inside notebook"
    ]
   },
   {
@@ -188,7 +190,7 @@
     "noblue = small.withColumn(\"noblue\", noBlueUDF(small[\"transformed\"])).select(\"noblue\")\n",
     "\n",
     "im = noblue.take(3)[2][0]                # take third image\n",
-    "Image.fromarray(toNDArray(im), \"RGB\")   # display the image inside notebook"
+    "plt.imshow(Image.fromarray(toNDArray(im), \"RGB\"))   # display the image inside notebook"
    ]
   },
   {

@@ -33,7 +33,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet\")\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet\").cache()\n",
     "tune, test = data.randomSplit([0.80, 0.20])\n",
     "tune.limit(10).toPandas()"
    ]

@@ -73,7 +73,7 @@
    },
    "outputs": [],
    "source": [
-    "df = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
+    "df = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\").cache()\n",
     "\n",
     "labelIndexer = StringIndexer(inputCol=\"income\", outputCol=\"label\", stringOrderType=\"alphabetAsc\").fit(df)\n",
     "print(\"Label index assigment: \" + str(set(zip(labelIndexer.labels, [0, 1]))))\n",
@@ -427,4 +427,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
@@ -76,7 +76,7 @@
     "labelIndexer = StringIndexer(inputCol=\"income\", outputCol=\"label\", stringOrderType=\"alphabetAsc\").fit(df)\n",
     "print(\"Label index assigment: \" + str(set(zip(labelIndexer.labels, [0, 1]))))\n",
     "\n",
-    "training = labelIndexer.transform(df)\n",
+    "training = labelIndexer.transform(df).cache()\n",
     "display(training)\n",
     "categorical_features = [\n",
     "    \"workclass\",\n",
@@ -318,4 +318,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
@@ -74,9 +74,10 @@
     "    spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
     "    .withColumn(\"label\", (col(\"rating\") > 3).cast(LongType()))\n",
     "    .select(\"label\", \"text\")\n",
+    "    .cache()\n",
     ")\n",
     "\n",
-    "data.limit(10).toPandas()"
+    "display(data)"
    ]
   },
   {
@@ -279,4 +280,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}