python/pyspark/sql/dataframe.py (30 additions, 6 deletions)

@@ -376,24 +376,48 @@ def foreachPartition(self, f):

@since(1.3)
def cache(self):
""" Persists with the default storage level (C{MEMORY_ONLY}).
"""Persists the :class:`DataFrame` with the default storage level (C{MEMORY_AND_DISK}).
Review comment (Contributor Author):
@rxin I updated the default in the doc, as it was actually incorrect previously.


.. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0.
"""
self.is_cached = True
self._jdf.cache()
return self

@since(1.3)
def persist(self, storageLevel=StorageLevel.MEMORY_ONLY):
"""Sets the storage level to persist its values across operations
after the first time it is computed. This can only be used to assign
a new storage level if the RDD does not have a storage level set yet.
If no storage level is specified defaults to (C{MEMORY_ONLY}).
def persist(self, storageLevel=StorageLevel.MEMORY_AND_DISK):
Review comment (Contributor Author @MLnick, Jun 21, 2016):
@rxin I updated the default here in persist, to match cache.

But actually, it's still not quite correct - the default storage levels for Python are all serialized. But the MEMORY-based ones don't match the Scala side (which are deserialized). This was done for RDDs but doesn't quite work for DataFrames (since DF on the Scala side is cached deserialized by default).

So here df.cache() results in MEMORY_AND_DISK (deser) while df.persist() results in MEMORY_AND_DISK (ser). Ideally I'd say we don't want to encourage users to accidentally use the serialized forms for memory-based DF caching (since it is less efficient, as I understand it?). Let me know what you think.
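A minimal sketch of how that mismatch would surface through the new storageLevel property (assuming a running SparkSession bound to `spark`; the printed levels are illustrative):

    # Sketch only: illustrates the cache()/persist() default mismatch described above.
    from pyspark.storagelevel import StorageLevel

    df = spark.range(10)

    # cache() delegates to the JVM Dataset default: MEMORY_AND_DISK, deserialized.
    df.cache()
    print(df.storageLevel)   # StorageLevel(True, True, False, True, 1)
    df.unpersist()

    # persist() goes through Python's MEMORY_AND_DISK, which is a serialized level.
    df.persist(StorageLevel.MEMORY_AND_DISK)
    print(df.storageLevel)   # StorageLevel(True, True, False, False, 1)
    df.unpersist()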

Review comment (Contributor Author):
One option is to set the default storage level here to None instead, and if it's not set call _jdf.persist() to ensure behaviour is the same as cache.
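For concreteness, a hypothetical sketch of that option (not the code in this PR): default the storage level to None and fall back to the JVM-side default so persist() behaves like cache().

    # Hypothetical variant of DataFrame.persist, not the implementation in this PR.
    def persist(self, storageLevel=None):
        self.is_cached = True
        if storageLevel is None:
            # Same path as cache(): let the Scala side apply its default
            # (MEMORY_AND_DISK, deserialized).
            self._jdf.persist()
        else:
            javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel)
            self._jdf.persist(javaStorageLevel)
        return self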

Review comment (Contributor Author):
ping @rxin

Review comment (Contributor Author):
ping @rxin on this comment?

Review comment (Contributor):
One downside of that approach is that the user can't easily request deserialized in-memory-only caching explicitly, or deserialized caching replicated on two machines.

Review comment (Contributor Author):
Yes, this is true. The issue is that the deser versions were deprecated in #10092 and made equal to the ser versions, so they can't actually be specified in Python any more. Hence we now have a discrepancy between Python RDDs (always stored ser) and DataFrames (stored deser in the tungsten/spark sql binary format by default; it is possible to store them ser, though AFAIK that will always be non-optimal and should certainly be discouraged).

cc @gatorsmile @davies @rxin
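For reference, a quick illustration of the point above, assuming the pyspark.storagelevel definitions as of 2.0 (the fourth flag in the repr is `deserialized`):

    # The Python memory-based levels are all serialized, so a deserialized
    # in-memory level cannot be requested from Python.
    from pyspark import StorageLevel

    print(StorageLevel.MEMORY_ONLY)       # StorageLevel(False, True, False, False, 1)
    print(StorageLevel.MEMORY_AND_DISK)   # StorageLevel(True, True, False, False, 1)
    # Scala's MEMORY_ONLY / MEMORY_AND_DISK, by contrast, are deserialized.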

"""Sets the storage level to persist the contents of the :class:`DataFrame` across
operations after the first time it is computed. This can only be used to assign
a new storage level if the :class:`DataFrame` does not have a storage level set yet.
If no storage level is specified defaults to (C{MEMORY_AND_DISK}).

.. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0.
"""
self.is_cached = True
javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel)
self._jdf.persist(javaStorageLevel)
return self

@property
@since(2.0)
def storageLevel(self):
Review comment (Contributor):
Maybe mark this as a property so we don't need to use parentheses?

"""Get the :class:`DataFrame`'s current storage level.

>>> df.storageLevel
StorageLevel(False, False, False, False, 1)
>>> df.cache().storageLevel
StorageLevel(True, True, False, True, 1)
>>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel
StorageLevel(True, False, False, False, 2)
"""
java_storage_level = self._jdf.storageLevel()
storage_level = StorageLevel(java_storage_level.useDisk(),
java_storage_level.useMemory(),
java_storage_level.useOffHeap(),
java_storage_level.deserialized(),
java_storage_level.replication())
return storage_level

@since(1.3)
def unpersist(self, blocking=False):
"""Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from
sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala (12 additions, 0 deletions)

@@ -2307,6 +2307,18 @@ class Dataset[T] private[sql](
this
}

/**
* Get the Dataset's current storage level, or StorageLevel.NONE if not persisted.
*
* @group basic
* @since 2.0.0
*/
def storageLevel: StorageLevel = {
sparkSession.sharedState.cacheManager.lookupCachedData(this).map { cachedData =>
cachedData.cachedRepresentation.storageLevel
}.getOrElse(StorageLevel.NONE)
}

/**
* Mark the Dataset as non-persistent, and remove all blocks for it from memory and disk.
*
sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala

@@ -21,11 +21,32 @@ import scala.language.postfixOps

import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.storage.StorageLevel


class DatasetCacheSuite extends QueryTest with SharedSQLContext {
import testImplicits._

test("get storage level") {
val ds1 = Seq("1", "2").toDS().as("a")
val ds2 = Seq(2, 3).toDS().as("b")

// default storage level
ds1.persist()
ds2.cache()
assert(ds1.storageLevel == StorageLevel.MEMORY_AND_DISK)
assert(ds2.storageLevel == StorageLevel.MEMORY_AND_DISK)
// unpersist
ds1.unpersist()
assert(ds1.storageLevel == StorageLevel.NONE)
// non-default storage level
ds1.persist(StorageLevel.MEMORY_ONLY_2)
Review comment (Member @gatorsmile, Jun 20, 2016):
When writing black-box tests, I might just try all the levels in the test case. We could even include a custom StorageLevel that differs from the predefined ones.

    import org.apache.spark.storage.StorageLevel._
    Seq(NONE, DISK_ONLY, DISK_ONLY_2, MEMORY_ONLY, MEMORY_ONLY_2, MEMORY_ONLY_SER,
      MEMORY_ONLY_SER_2, MEMORY_AND_DISK, MEMORY_AND_DISK_2, MEMORY_AND_DISK_SER,
      MEMORY_AND_DISK_SER_2, OFF_HEAP).foreach { level =>
      ds1.persist(level)
      assert(ds1.storageLevel == level)
      ds1.unpersist()
      assert(ds1.storageLevel == StorageLevel.NONE)
    }

Review comment (Contributor Author):
I'm kinda neutral on this - it doesn't really seem necessary to me, since pretty much by definition if one storage level works then they all do.

Review comment (Member):
I knew. : ) That is white box testing. Normally, writing test cases should not be done by the same person who wrote the code.

assert(ds1.storageLevel == StorageLevel.MEMORY_ONLY_2)
// joined Dataset should not be persisted
val joined = ds1.joinWith(ds2, $"a.value" === $"b.value")
assert(joined.storageLevel == StorageLevel.NONE)
}

test("persist and unpersist") {
val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS().select(expr("_2 + 1").as[Int])
val cached = ds.cache()
@@ -39,8 +60,7 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext {
2, 3, 4)
// Drop the cache.
cached.unpersist()
assert(spark.sharedState.cacheManager.lookupCachedData(cached).isEmpty,
"The Dataset should not be cached.")
assert(cached.storageLevel == StorageLevel.NONE, "The Dataset should not be cached.")
}

test("persist and then rebind right encoder when join 2 datasets") {
@@ -57,11 +77,9 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext {
assertCached(joined, 2)

ds1.unpersist()
assert(spark.sharedState.cacheManager.lookupCachedData(ds1).isEmpty,
"The Dataset ds1 should not be cached.")
assert(ds1.storageLevel == StorageLevel.NONE, "The Dataset ds1 should not be cached.")
ds2.unpersist()
assert(spark.sharedState.cacheManager.lookupCachedData(ds2).isEmpty,
"The Dataset ds2 should not be cached.")
assert(ds2.storageLevel == StorageLevel.NONE, "The Dataset ds2 should not be cached.")
}

test("persist and then groupBy columns asKey, map") {
@@ -76,10 +94,8 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext {
assertCached(agged.filter(_._1 == "b"))

ds.unpersist()
assert(spark.sharedState.cacheManager.lookupCachedData(ds).isEmpty,
"The Dataset ds should not be cached.")
assert(ds.storageLevel == StorageLevel.NONE, "The Dataset ds should not be cached.")
agged.unpersist()
assert(spark.sharedState.cacheManager.lookupCachedData(agged).isEmpty,
"The Dataset agged should not be cached.")
assert(agged.storageLevel == StorageLevel.NONE, "The Dataset agged should not be cached.")
}
}